diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18132 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997583723964626, + "eval_steps": 500, + "global_step": 2586, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 47.279807180167374, + "learning_rate": 1.282051282051282e-07, + "loss": 12.1123, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 29.234797993521592, + "learning_rate": 2.564102564102564e-07, + "loss": 12.1377, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 48.013302305052015, + "learning_rate": 3.846153846153847e-07, + "loss": 12.0928, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 49.648225967434605, + "learning_rate": 5.128205128205128e-07, + "loss": 12.1123, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 41.50977099753674, + "learning_rate": 6.41025641025641e-07, + "loss": 12.0527, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 43.3095044824057, + "learning_rate": 7.692307692307694e-07, + "loss": 12.0547, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 31.569078860107826, + "learning_rate": 8.974358974358975e-07, + "loss": 12.1328, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 42.593654219739186, + "learning_rate": 1.0256410256410257e-06, + "loss": 12.0615, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 33.585740478581016, + "learning_rate": 1.153846153846154e-06, + "loss": 12.0371, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 39.77975641784306, + "learning_rate": 1.282051282051282e-06, + "loss": 12.001, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 23.23163778179969, + "learning_rate": 1.4102564102564104e-06, + "loss": 12.1406, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 35.197855381569674, + "learning_rate": 1.5384615384615387e-06, + "loss": 11.9727, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 35.28289473472646, + "learning_rate": 1.6666666666666667e-06, + "loss": 11.8623, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 22.964433832401195, + "learning_rate": 1.794871794871795e-06, + "loss": 11.8867, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 32.38348517511878, + "learning_rate": 1.9230769230769234e-06, + "loss": 11.7998, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 21.734040594578076, + "learning_rate": 2.0512820512820513e-06, + "loss": 11.7617, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 25.19673160652617, + "learning_rate": 2.1794871794871797e-06, + "loss": 11.6992, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 29.296531551199227, + "learning_rate": 2.307692307692308e-06, + "loss": 11.5586, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 46.13003375968801, + "learning_rate": 2.435897435897436e-06, + "loss": 11.5527, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 41.849036854455164, + "learning_rate": 2.564102564102564e-06, + "loss": 11.3779, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 39.537236073998145, + "learning_rate": 2.6923076923076923e-06, + "loss": 11.2012, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 34.57172567317474, + "learning_rate": 2.8205128205128207e-06, + "loss": 11.0215, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 506.69228168760816, + "learning_rate": 2.948717948717949e-06, + "loss": 12.9199, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 111.64311280501529, + "learning_rate": 3.0769230769230774e-06, + "loss": 10.7695, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 316.1715029609909, + "learning_rate": 3.205128205128206e-06, + "loss": 12.2754, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 73.89514507362225, + "learning_rate": 3.3333333333333333e-06, + "loss": 10.127, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 182.78134072736862, + "learning_rate": 3.4615384615384617e-06, + "loss": 10.1006, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 254.93513728578208, + "learning_rate": 3.58974358974359e-06, + "loss": 10.3701, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 148.58471389468403, + "learning_rate": 3.7179487179487184e-06, + "loss": 8.9219, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 108.77056549970071, + "learning_rate": 3.846153846153847e-06, + "loss": 9.1855, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 103.87787595236031, + "learning_rate": 3.974358974358974e-06, + "loss": 8.1724, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 102.31703480340848, + "learning_rate": 4.102564102564103e-06, + "loss": 6.9956, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 172.46373641294485, + "learning_rate": 4.230769230769231e-06, + "loss": 6.3066, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 87.5216207739019, + "learning_rate": 4.358974358974359e-06, + "loss": 5.5588, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 80.50686987902449, + "learning_rate": 4.487179487179488e-06, + "loss": 4.576, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 57.645390934076, + "learning_rate": 4.615384615384616e-06, + "loss": 3.7121, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 37.675481020558486, + "learning_rate": 4.743589743589744e-06, + "loss": 3.5498, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 27.516266863848507, + "learning_rate": 4.871794871794872e-06, + "loss": 3.0763, + "step": 38 + }, + { + "epoch": 0.02, + "grad_norm": 30.54432213720788, + "learning_rate": 5e-06, + "loss": 2.5987, + "step": 39 + }, + { + "epoch": 0.02, + "grad_norm": 25.419156365403072, + "learning_rate": 5.128205128205128e-06, + "loss": 2.578, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 22.94976417729944, + "learning_rate": 5.256410256410257e-06, + "loss": 2.5072, + "step": 41 + }, + { + "epoch": 0.02, + "grad_norm": 19.709663468695748, + "learning_rate": 5.384615384615385e-06, + "loss": 2.2178, + "step": 42 + }, + { + "epoch": 0.02, + "grad_norm": 17.624104489751552, + "learning_rate": 5.512820512820514e-06, + "loss": 1.9278, + "step": 43 + }, + { + "epoch": 0.02, + "grad_norm": 18.649553341368065, + "learning_rate": 5.641025641025641e-06, + "loss": 1.8549, + "step": 44 + }, + { + "epoch": 0.02, + "grad_norm": 54.778542254207636, + "learning_rate": 5.769230769230769e-06, + "loss": 2.0851, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 16.034705703498012, + "learning_rate": 5.897435897435898e-06, + "loss": 1.5248, + "step": 46 + }, + { + "epoch": 0.02, + "grad_norm": 17.61319779654908, + "learning_rate": 6.025641025641026e-06, + "loss": 1.5049, + "step": 47 + }, + { + "epoch": 0.02, + "grad_norm": 17.013952030578054, + "learning_rate": 6.153846153846155e-06, + "loss": 1.5219, + "step": 48 + }, + { + "epoch": 0.02, + "grad_norm": 20.364691223375658, + "learning_rate": 6.282051282051282e-06, + "loss": 1.2063, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 15.841850856770195, + "learning_rate": 6.410256410256412e-06, + "loss": 1.5725, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 14.029717943966945, + "learning_rate": 6.538461538461539e-06, + "loss": 1.1939, + "step": 51 + }, + { + "epoch": 0.02, + "grad_norm": 34.08674532825822, + "learning_rate": 6.666666666666667e-06, + "loss": 1.0203, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 16.254122121541265, + "learning_rate": 6.794871794871796e-06, + "loss": 0.9564, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 17.88736857446396, + "learning_rate": 6.923076923076923e-06, + "loss": 1.443, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 13.852289912395646, + "learning_rate": 7.051282051282053e-06, + "loss": 1.1138, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 20.262325334929987, + "learning_rate": 7.17948717948718e-06, + "loss": 0.9219, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 15.85832036827058, + "learning_rate": 7.307692307692308e-06, + "loss": 0.8468, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 13.689120246753362, + "learning_rate": 7.435897435897437e-06, + "loss": 1.2285, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 12.411046871864485, + "learning_rate": 7.564102564102564e-06, + "loss": 1.0675, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 13.193089449601917, + "learning_rate": 7.692307692307694e-06, + "loss": 1.2942, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 11.848711317418662, + "learning_rate": 7.820512820512822e-06, + "loss": 0.7793, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 11.254231397561643, + "learning_rate": 7.948717948717949e-06, + "loss": 1.0076, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 11.224081466208796, + "learning_rate": 8.076923076923077e-06, + "loss": 0.7495, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 19.198184160612673, + "learning_rate": 8.205128205128205e-06, + "loss": 0.7391, + "step": 64 + }, + { + "epoch": 0.03, + "grad_norm": 12.603751018340077, + "learning_rate": 8.333333333333334e-06, + "loss": 1.1272, + "step": 65 + }, + { + "epoch": 0.03, + "grad_norm": 10.648011495220087, + "learning_rate": 8.461538461538462e-06, + "loss": 0.9172, + "step": 66 + }, + { + "epoch": 0.03, + "grad_norm": 12.597663609747956, + "learning_rate": 8.58974358974359e-06, + "loss": 1.0398, + "step": 67 + }, + { + "epoch": 0.03, + "grad_norm": 11.80888600284641, + "learning_rate": 8.717948717948719e-06, + "loss": 0.8381, + "step": 68 + }, + { + "epoch": 0.03, + "grad_norm": 12.265720281737142, + "learning_rate": 8.846153846153847e-06, + "loss": 1.0565, + "step": 69 + }, + { + "epoch": 0.03, + "grad_norm": 10.327563244701436, + "learning_rate": 8.974358974358976e-06, + "loss": 0.7898, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 11.548464366276166, + "learning_rate": 9.102564102564104e-06, + "loss": 1.0255, + "step": 71 + }, + { + "epoch": 0.03, + "grad_norm": 9.654369764317849, + "learning_rate": 9.230769230769232e-06, + "loss": 1.1506, + "step": 72 + }, + { + "epoch": 0.03, + "grad_norm": 10.733536199369656, + "learning_rate": 9.358974358974359e-06, + "loss": 0.896, + "step": 73 + }, + { + "epoch": 0.03, + "grad_norm": 13.074795791781662, + "learning_rate": 9.487179487179487e-06, + "loss": 1.217, + "step": 74 + }, + { + "epoch": 0.03, + "grad_norm": 10.301488347706664, + "learning_rate": 9.615384615384616e-06, + "loss": 0.8091, + "step": 75 + }, + { + "epoch": 0.03, + "grad_norm": 7.895001720574654, + "learning_rate": 9.743589743589744e-06, + "loss": 0.8315, + "step": 76 + }, + { + "epoch": 0.03, + "grad_norm": 11.33028882065257, + "learning_rate": 9.871794871794872e-06, + "loss": 0.9963, + "step": 77 + }, + { + "epoch": 0.03, + "grad_norm": 11.031516567458516, + "learning_rate": 1e-05, + "loss": 1.2161, + "step": 78 + }, + { + "epoch": 0.03, + "grad_norm": 10.548517188835804, + "learning_rate": 9.999996077304179e-06, + "loss": 0.8166, + "step": 79 + }, + { + "epoch": 0.03, + "grad_norm": 8.368075026226792, + "learning_rate": 9.999984309222866e-06, + "loss": 0.5367, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 10.017734661169415, + "learning_rate": 9.999964695774527e-06, + "loss": 0.7728, + "step": 81 + }, + { + "epoch": 0.03, + "grad_norm": 12.113244343569303, + "learning_rate": 9.99993723698994e-06, + "loss": 0.9339, + "step": 82 + }, + { + "epoch": 0.03, + "grad_norm": 10.808266144501516, + "learning_rate": 9.999901932912188e-06, + "loss": 0.591, + "step": 83 + }, + { + "epoch": 0.03, + "grad_norm": 15.135445364844132, + "learning_rate": 9.999858783596665e-06, + "loss": 0.9249, + "step": 84 + }, + { + "epoch": 0.03, + "grad_norm": 10.530424283253378, + "learning_rate": 9.999807789111075e-06, + "loss": 0.643, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 10.558115278249298, + "learning_rate": 9.999748949535436e-06, + "loss": 1.0037, + "step": 86 + }, + { + "epoch": 0.03, + "grad_norm": 11.46631822932675, + "learning_rate": 9.99968226496207e-06, + "loss": 0.9391, + "step": 87 + }, + { + "epoch": 0.03, + "grad_norm": 9.533803457871283, + "learning_rate": 9.999607735495609e-06, + "loss": 0.4386, + "step": 88 + }, + { + "epoch": 0.03, + "grad_norm": 9.575415222368854, + "learning_rate": 9.999525361252996e-06, + "loss": 0.4872, + "step": 89 + }, + { + "epoch": 0.03, + "grad_norm": 12.862480239438767, + "learning_rate": 9.999435142363484e-06, + "loss": 0.997, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 11.265338092891238, + "learning_rate": 9.999337078968633e-06, + "loss": 0.4861, + "step": 91 + }, + { + "epoch": 0.04, + "grad_norm": 6.813132934737774, + "learning_rate": 9.999231171222312e-06, + "loss": 0.4695, + "step": 92 + }, + { + "epoch": 0.04, + "grad_norm": 10.494494367234264, + "learning_rate": 9.999117419290698e-06, + "loss": 0.9507, + "step": 93 + }, + { + "epoch": 0.04, + "grad_norm": 11.29731472008422, + "learning_rate": 9.998995823352276e-06, + "loss": 1.0835, + "step": 94 + }, + { + "epoch": 0.04, + "grad_norm": 9.973003505943797, + "learning_rate": 9.998866383597842e-06, + "loss": 0.6378, + "step": 95 + }, + { + "epoch": 0.04, + "grad_norm": 10.378602523102169, + "learning_rate": 9.998729100230497e-06, + "loss": 1.014, + "step": 96 + }, + { + "epoch": 0.04, + "grad_norm": 8.339949662257846, + "learning_rate": 9.998583973465647e-06, + "loss": 0.3508, + "step": 97 + }, + { + "epoch": 0.04, + "grad_norm": 8.54455719121349, + "learning_rate": 9.998431003531008e-06, + "loss": 0.5341, + "step": 98 + }, + { + "epoch": 0.04, + "grad_norm": 7.967610734692199, + "learning_rate": 9.998270190666602e-06, + "loss": 0.5081, + "step": 99 + }, + { + "epoch": 0.04, + "grad_norm": 6.376613736854886, + "learning_rate": 9.998101535124758e-06, + "loss": 0.3209, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 8.601722192201839, + "learning_rate": 9.99792503717011e-06, + "loss": 0.4846, + "step": 101 + }, + { + "epoch": 0.04, + "grad_norm": 8.22359026928023, + "learning_rate": 9.997740697079595e-06, + "loss": 0.7585, + "step": 102 + }, + { + "epoch": 0.04, + "grad_norm": 8.040968238862975, + "learning_rate": 9.997548515142457e-06, + "loss": 0.5073, + "step": 103 + }, + { + "epoch": 0.04, + "grad_norm": 9.084715042296548, + "learning_rate": 9.997348491660247e-06, + "loss": 0.6983, + "step": 104 + }, + { + "epoch": 0.04, + "grad_norm": 7.4086997542404776, + "learning_rate": 9.997140626946815e-06, + "loss": 0.3524, + "step": 105 + }, + { + "epoch": 0.04, + "grad_norm": 11.077089617894178, + "learning_rate": 9.99692492132832e-06, + "loss": 0.6197, + "step": 106 + }, + { + "epoch": 0.04, + "grad_norm": 7.61237049405628, + "learning_rate": 9.996701375143216e-06, + "loss": 0.4866, + "step": 107 + }, + { + "epoch": 0.04, + "grad_norm": 7.230360068207208, + "learning_rate": 9.99646998874227e-06, + "loss": 0.5656, + "step": 108 + }, + { + "epoch": 0.04, + "grad_norm": 9.425448570927497, + "learning_rate": 9.996230762488544e-06, + "loss": 0.8637, + "step": 109 + }, + { + "epoch": 0.04, + "grad_norm": 13.744697312303776, + "learning_rate": 9.9959836967574e-06, + "loss": 0.5646, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 6.3022800651898585, + "learning_rate": 9.995728791936505e-06, + "loss": 0.3901, + "step": 111 + }, + { + "epoch": 0.04, + "grad_norm": 8.15302821871786, + "learning_rate": 9.995466048425825e-06, + "loss": 0.65, + "step": 112 + }, + { + "epoch": 0.04, + "grad_norm": 7.607942057954628, + "learning_rate": 9.995195466637629e-06, + "loss": 0.4055, + "step": 113 + }, + { + "epoch": 0.04, + "grad_norm": 7.6398697815205265, + "learning_rate": 9.994917046996472e-06, + "loss": 0.9344, + "step": 114 + }, + { + "epoch": 0.04, + "grad_norm": 9.122555689230712, + "learning_rate": 9.994630789939223e-06, + "loss": 0.8358, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 9.094408808052112, + "learning_rate": 9.994336695915041e-06, + "loss": 0.8405, + "step": 116 + }, + { + "epoch": 0.05, + "grad_norm": 9.05662306983693, + "learning_rate": 9.99403476538538e-06, + "loss": 0.8575, + "step": 117 + }, + { + "epoch": 0.05, + "grad_norm": 8.143200299838425, + "learning_rate": 9.993724998823995e-06, + "loss": 0.8526, + "step": 118 + }, + { + "epoch": 0.05, + "grad_norm": 9.508983052481922, + "learning_rate": 9.993407396716935e-06, + "loss": 0.7197, + "step": 119 + }, + { + "epoch": 0.05, + "grad_norm": 8.253313336966283, + "learning_rate": 9.993081959562539e-06, + "loss": 0.6145, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 8.400637991921391, + "learning_rate": 9.992748687871445e-06, + "loss": 0.686, + "step": 121 + }, + { + "epoch": 0.05, + "grad_norm": 5.266223281104883, + "learning_rate": 9.992407582166582e-06, + "loss": 0.2203, + "step": 122 + }, + { + "epoch": 0.05, + "grad_norm": 8.173864805127318, + "learning_rate": 9.992058642983173e-06, + "loss": 0.5649, + "step": 123 + }, + { + "epoch": 0.05, + "grad_norm": 9.134578213259644, + "learning_rate": 9.991701870868731e-06, + "loss": 0.8683, + "step": 124 + }, + { + "epoch": 0.05, + "grad_norm": 15.753200529590611, + "learning_rate": 9.991337266383056e-06, + "loss": 0.7512, + "step": 125 + }, + { + "epoch": 0.05, + "grad_norm": 9.693412664995114, + "learning_rate": 9.990964830098246e-06, + "loss": 0.7306, + "step": 126 + }, + { + "epoch": 0.05, + "grad_norm": 9.082364863469143, + "learning_rate": 9.990584562598679e-06, + "loss": 0.7836, + "step": 127 + }, + { + "epoch": 0.05, + "grad_norm": 8.491549029464382, + "learning_rate": 9.990196464481025e-06, + "loss": 0.5206, + "step": 128 + }, + { + "epoch": 0.05, + "grad_norm": 6.579409745338709, + "learning_rate": 9.989800536354243e-06, + "loss": 0.629, + "step": 129 + }, + { + "epoch": 0.05, + "grad_norm": 7.202005969283118, + "learning_rate": 9.989396778839572e-06, + "loss": 0.428, + "step": 130 + }, + { + "epoch": 0.05, + "grad_norm": 7.742828175807453, + "learning_rate": 9.988985192570541e-06, + "loss": 0.7628, + "step": 131 + }, + { + "epoch": 0.05, + "grad_norm": 6.611548082050718, + "learning_rate": 9.98856577819296e-06, + "loss": 0.4257, + "step": 132 + }, + { + "epoch": 0.05, + "grad_norm": 6.783167273856261, + "learning_rate": 9.988138536364922e-06, + "loss": 0.3008, + "step": 133 + }, + { + "epoch": 0.05, + "grad_norm": 9.184412983765748, + "learning_rate": 9.987703467756807e-06, + "loss": 0.7117, + "step": 134 + }, + { + "epoch": 0.05, + "grad_norm": 8.054224609301897, + "learning_rate": 9.987260573051268e-06, + "loss": 0.4991, + "step": 135 + }, + { + "epoch": 0.05, + "grad_norm": 8.54045338023512, + "learning_rate": 9.986809852943244e-06, + "loss": 0.3961, + "step": 136 + }, + { + "epoch": 0.05, + "grad_norm": 6.999202345977584, + "learning_rate": 9.986351308139948e-06, + "loss": 0.4433, + "step": 137 + }, + { + "epoch": 0.05, + "grad_norm": 8.657041187137718, + "learning_rate": 9.985884939360873e-06, + "loss": 0.7117, + "step": 138 + }, + { + "epoch": 0.05, + "grad_norm": 8.736255352268028, + "learning_rate": 9.985410747337788e-06, + "loss": 0.5681, + "step": 139 + }, + { + "epoch": 0.05, + "grad_norm": 7.4160416582186235, + "learning_rate": 9.984928732814738e-06, + "loss": 0.7026, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 9.551085654208427, + "learning_rate": 9.984438896548043e-06, + "loss": 0.8086, + "step": 141 + }, + { + "epoch": 0.05, + "grad_norm": 7.585721693914806, + "learning_rate": 9.983941239306291e-06, + "loss": 0.5382, + "step": 142 + }, + { + "epoch": 0.06, + "grad_norm": 7.509174347972325, + "learning_rate": 9.98343576187035e-06, + "loss": 0.5778, + "step": 143 + }, + { + "epoch": 0.06, + "grad_norm": 6.064880941306638, + "learning_rate": 9.98292246503335e-06, + "loss": 0.2871, + "step": 144 + }, + { + "epoch": 0.06, + "grad_norm": 10.19792899129946, + "learning_rate": 9.982401349600695e-06, + "loss": 1.0037, + "step": 145 + }, + { + "epoch": 0.06, + "grad_norm": 6.544744033275711, + "learning_rate": 9.981872416390055e-06, + "loss": 0.505, + "step": 146 + }, + { + "epoch": 0.06, + "grad_norm": 8.765816943542017, + "learning_rate": 9.981335666231369e-06, + "loss": 0.6552, + "step": 147 + }, + { + "epoch": 0.06, + "grad_norm": 9.94756159633514, + "learning_rate": 9.98079109996684e-06, + "loss": 0.8352, + "step": 148 + }, + { + "epoch": 0.06, + "grad_norm": 8.327165984682571, + "learning_rate": 9.980238718450934e-06, + "loss": 0.6213, + "step": 149 + }, + { + "epoch": 0.06, + "grad_norm": 6.680930771815867, + "learning_rate": 9.979678522550382e-06, + "loss": 0.561, + "step": 150 + }, + { + "epoch": 0.06, + "grad_norm": 9.015659229577425, + "learning_rate": 9.979110513144175e-06, + "loss": 0.8584, + "step": 151 + }, + { + "epoch": 0.06, + "grad_norm": 8.95101306790891, + "learning_rate": 9.978534691123563e-06, + "loss": 0.9032, + "step": 152 + }, + { + "epoch": 0.06, + "grad_norm": 9.458806183904773, + "learning_rate": 9.977951057392057e-06, + "loss": 0.7042, + "step": 153 + }, + { + "epoch": 0.06, + "grad_norm": 8.914539407204849, + "learning_rate": 9.977359612865424e-06, + "loss": 0.6532, + "step": 154 + }, + { + "epoch": 0.06, + "grad_norm": 9.14762479409611, + "learning_rate": 9.976760358471687e-06, + "loss": 0.5867, + "step": 155 + }, + { + "epoch": 0.06, + "grad_norm": 6.060015023394262, + "learning_rate": 9.976153295151123e-06, + "loss": 0.5223, + "step": 156 + }, + { + "epoch": 0.06, + "grad_norm": 8.214670267078661, + "learning_rate": 9.975538423856261e-06, + "loss": 0.7876, + "step": 157 + }, + { + "epoch": 0.06, + "grad_norm": 7.430010349609652, + "learning_rate": 9.974915745551882e-06, + "loss": 0.3879, + "step": 158 + }, + { + "epoch": 0.06, + "grad_norm": 9.079436310446846, + "learning_rate": 9.97428526121502e-06, + "loss": 0.9568, + "step": 159 + }, + { + "epoch": 0.06, + "grad_norm": 8.1641882247915, + "learning_rate": 9.97364697183495e-06, + "loss": 0.8483, + "step": 160 + }, + { + "epoch": 0.06, + "grad_norm": 6.245660186154265, + "learning_rate": 9.973000878413203e-06, + "loss": 0.5145, + "step": 161 + }, + { + "epoch": 0.06, + "grad_norm": 10.003192641435247, + "learning_rate": 9.972346981963546e-06, + "loss": 0.7765, + "step": 162 + }, + { + "epoch": 0.06, + "grad_norm": 5.705352774502806, + "learning_rate": 9.971685283511996e-06, + "loss": 0.2866, + "step": 163 + }, + { + "epoch": 0.06, + "grad_norm": 7.376060609181173, + "learning_rate": 9.971015784096808e-06, + "loss": 0.3711, + "step": 164 + }, + { + "epoch": 0.06, + "grad_norm": 6.075769514876911, + "learning_rate": 9.970338484768481e-06, + "loss": 0.4238, + "step": 165 + }, + { + "epoch": 0.06, + "grad_norm": 5.949963812119086, + "learning_rate": 9.969653386589749e-06, + "loss": 0.6609, + "step": 166 + }, + { + "epoch": 0.06, + "grad_norm": 6.650487883668113, + "learning_rate": 9.968960490635584e-06, + "loss": 0.7281, + "step": 167 + }, + { + "epoch": 0.06, + "grad_norm": 6.03687742377937, + "learning_rate": 9.968259797993197e-06, + "loss": 0.2753, + "step": 168 + }, + { + "epoch": 0.07, + "grad_norm": 7.007716350627047, + "learning_rate": 9.967551309762028e-06, + "loss": 0.5426, + "step": 169 + }, + { + "epoch": 0.07, + "grad_norm": 7.353735627412365, + "learning_rate": 9.96683502705375e-06, + "loss": 0.7178, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 7.649303693881235, + "learning_rate": 9.966110950992267e-06, + "loss": 0.4387, + "step": 171 + }, + { + "epoch": 0.07, + "grad_norm": 5.989185317191312, + "learning_rate": 9.965379082713711e-06, + "loss": 0.3955, + "step": 172 + }, + { + "epoch": 0.07, + "grad_norm": 8.616226128228174, + "learning_rate": 9.964639423366442e-06, + "loss": 0.6876, + "step": 173 + }, + { + "epoch": 0.07, + "grad_norm": 6.0596544890992305, + "learning_rate": 9.963891974111042e-06, + "loss": 0.5467, + "step": 174 + }, + { + "epoch": 0.07, + "grad_norm": 6.981627770800435, + "learning_rate": 9.963136736120318e-06, + "loss": 0.6744, + "step": 175 + }, + { + "epoch": 0.07, + "grad_norm": 8.219503583871042, + "learning_rate": 9.962373710579296e-06, + "loss": 0.5867, + "step": 176 + }, + { + "epoch": 0.07, + "grad_norm": 5.75735079747755, + "learning_rate": 9.961602898685225e-06, + "loss": 0.3495, + "step": 177 + }, + { + "epoch": 0.07, + "grad_norm": 6.1393224951719985, + "learning_rate": 9.960824301647569e-06, + "loss": 0.459, + "step": 178 + }, + { + "epoch": 0.07, + "grad_norm": 8.521809955844176, + "learning_rate": 9.960037920688007e-06, + "loss": 0.6667, + "step": 179 + }, + { + "epoch": 0.07, + "grad_norm": 8.849867961965746, + "learning_rate": 9.959243757040434e-06, + "loss": 1.0293, + "step": 180 + }, + { + "epoch": 0.07, + "grad_norm": 6.035407322874203, + "learning_rate": 9.958441811950952e-06, + "loss": 0.4224, + "step": 181 + }, + { + "epoch": 0.07, + "grad_norm": 6.48672035787583, + "learning_rate": 9.957632086677876e-06, + "loss": 0.4148, + "step": 182 + }, + { + "epoch": 0.07, + "grad_norm": 6.95586573225404, + "learning_rate": 9.956814582491731e-06, + "loss": 0.4398, + "step": 183 + }, + { + "epoch": 0.07, + "grad_norm": 7.175368456311246, + "learning_rate": 9.955989300675243e-06, + "loss": 0.6539, + "step": 184 + }, + { + "epoch": 0.07, + "grad_norm": 8.349612594988935, + "learning_rate": 9.955156242523345e-06, + "loss": 0.404, + "step": 185 + }, + { + "epoch": 0.07, + "grad_norm": 10.043658422156373, + "learning_rate": 9.95431540934317e-06, + "loss": 0.8843, + "step": 186 + }, + { + "epoch": 0.07, + "grad_norm": 6.611942060388665, + "learning_rate": 9.95346680245405e-06, + "loss": 0.2997, + "step": 187 + }, + { + "epoch": 0.07, + "grad_norm": 7.327339649145689, + "learning_rate": 9.952610423187516e-06, + "loss": 0.4541, + "step": 188 + }, + { + "epoch": 0.07, + "grad_norm": 7.636493879004018, + "learning_rate": 9.951746272887298e-06, + "loss": 0.601, + "step": 189 + }, + { + "epoch": 0.07, + "grad_norm": 8.154432748014298, + "learning_rate": 9.95087435290931e-06, + "loss": 0.6419, + "step": 190 + }, + { + "epoch": 0.07, + "grad_norm": 6.807187869480093, + "learning_rate": 9.949994664621664e-06, + "loss": 0.7136, + "step": 191 + }, + { + "epoch": 0.07, + "grad_norm": 7.89695756697466, + "learning_rate": 9.949107209404664e-06, + "loss": 0.6911, + "step": 192 + }, + { + "epoch": 0.07, + "grad_norm": 8.63703021104669, + "learning_rate": 9.948211988650794e-06, + "loss": 0.513, + "step": 193 + }, + { + "epoch": 0.08, + "grad_norm": 7.303030646490855, + "learning_rate": 9.947309003764723e-06, + "loss": 0.63, + "step": 194 + }, + { + "epoch": 0.08, + "grad_norm": 7.4080767213233685, + "learning_rate": 9.946398256163307e-06, + "loss": 0.8587, + "step": 195 + }, + { + "epoch": 0.08, + "grad_norm": 7.178629015840178, + "learning_rate": 9.94547974727558e-06, + "loss": 0.3381, + "step": 196 + }, + { + "epoch": 0.08, + "grad_norm": 7.192474431550429, + "learning_rate": 9.944553478542757e-06, + "loss": 0.7226, + "step": 197 + }, + { + "epoch": 0.08, + "grad_norm": 6.254971047700981, + "learning_rate": 9.943619451418225e-06, + "loss": 0.562, + "step": 198 + }, + { + "epoch": 0.08, + "grad_norm": 6.066029622620954, + "learning_rate": 9.942677667367541e-06, + "loss": 0.4646, + "step": 199 + }, + { + "epoch": 0.08, + "grad_norm": 7.633734882218207, + "learning_rate": 9.941728127868446e-06, + "loss": 0.4237, + "step": 200 + }, + { + "epoch": 0.08, + "grad_norm": 6.8942365470381946, + "learning_rate": 9.940770834410836e-06, + "loss": 0.6326, + "step": 201 + }, + { + "epoch": 0.08, + "grad_norm": 6.496701523436003, + "learning_rate": 9.939805788496778e-06, + "loss": 0.6076, + "step": 202 + }, + { + "epoch": 0.08, + "grad_norm": 6.523333549756454, + "learning_rate": 9.938832991640512e-06, + "loss": 0.6535, + "step": 203 + }, + { + "epoch": 0.08, + "grad_norm": 5.855352384023266, + "learning_rate": 9.937852445368427e-06, + "loss": 0.3365, + "step": 204 + }, + { + "epoch": 0.08, + "grad_norm": 6.558584636294461, + "learning_rate": 9.936864151219077e-06, + "loss": 0.5154, + "step": 205 + }, + { + "epoch": 0.08, + "grad_norm": 8.366101209111038, + "learning_rate": 9.935868110743175e-06, + "loss": 0.8519, + "step": 206 + }, + { + "epoch": 0.08, + "grad_norm": 6.322957013995513, + "learning_rate": 9.934864325503584e-06, + "loss": 0.5198, + "step": 207 + }, + { + "epoch": 0.08, + "grad_norm": 5.426973359648164, + "learning_rate": 9.933852797075325e-06, + "loss": 0.5909, + "step": 208 + }, + { + "epoch": 0.08, + "grad_norm": 7.3776468368540336, + "learning_rate": 9.932833527045563e-06, + "loss": 0.7068, + "step": 209 + }, + { + "epoch": 0.08, + "grad_norm": 7.203666798541823, + "learning_rate": 9.931806517013612e-06, + "loss": 0.2836, + "step": 210 + }, + { + "epoch": 0.08, + "grad_norm": 7.448120072789972, + "learning_rate": 9.930771768590934e-06, + "loss": 0.6617, + "step": 211 + }, + { + "epoch": 0.08, + "grad_norm": 7.140414903559803, + "learning_rate": 9.929729283401127e-06, + "loss": 1.1004, + "step": 212 + }, + { + "epoch": 0.08, + "grad_norm": 8.471338878333157, + "learning_rate": 9.928679063079934e-06, + "loss": 1.0521, + "step": 213 + }, + { + "epoch": 0.08, + "grad_norm": 7.674920645877626, + "learning_rate": 9.927621109275233e-06, + "loss": 0.667, + "step": 214 + }, + { + "epoch": 0.08, + "grad_norm": 7.34475789270995, + "learning_rate": 9.926555423647035e-06, + "loss": 0.561, + "step": 215 + }, + { + "epoch": 0.08, + "grad_norm": 8.257567861186612, + "learning_rate": 9.925482007867485e-06, + "loss": 0.5936, + "step": 216 + }, + { + "epoch": 0.08, + "grad_norm": 7.560031013337673, + "learning_rate": 9.924400863620857e-06, + "loss": 0.4272, + "step": 217 + }, + { + "epoch": 0.08, + "grad_norm": 6.224823857790595, + "learning_rate": 9.92331199260355e-06, + "loss": 0.3991, + "step": 218 + }, + { + "epoch": 0.08, + "grad_norm": 7.517561386732611, + "learning_rate": 9.922215396524089e-06, + "loss": 0.8067, + "step": 219 + }, + { + "epoch": 0.09, + "grad_norm": 7.32107399129161, + "learning_rate": 9.921111077103118e-06, + "loss": 0.8056, + "step": 220 + }, + { + "epoch": 0.09, + "grad_norm": 6.723309673859593, + "learning_rate": 9.9199990360734e-06, + "loss": 0.4147, + "step": 221 + }, + { + "epoch": 0.09, + "grad_norm": 8.235724496677339, + "learning_rate": 9.918879275179819e-06, + "loss": 0.7469, + "step": 222 + }, + { + "epoch": 0.09, + "grad_norm": 6.6575004399317415, + "learning_rate": 9.91775179617936e-06, + "loss": 0.3459, + "step": 223 + }, + { + "epoch": 0.09, + "grad_norm": 10.460870124299543, + "learning_rate": 9.916616600841133e-06, + "loss": 0.9322, + "step": 224 + }, + { + "epoch": 0.09, + "grad_norm": 8.10853460812157, + "learning_rate": 9.915473690946345e-06, + "loss": 0.7995, + "step": 225 + }, + { + "epoch": 0.09, + "grad_norm": 7.695166873286982, + "learning_rate": 9.914323068288312e-06, + "loss": 0.6969, + "step": 226 + }, + { + "epoch": 0.09, + "grad_norm": 6.022883524436682, + "learning_rate": 9.91316473467245e-06, + "loss": 0.2709, + "step": 227 + }, + { + "epoch": 0.09, + "grad_norm": 7.906177789166639, + "learning_rate": 9.911998691916275e-06, + "loss": 0.7719, + "step": 228 + }, + { + "epoch": 0.09, + "grad_norm": 8.201805471092074, + "learning_rate": 9.910824941849401e-06, + "loss": 0.4027, + "step": 229 + }, + { + "epoch": 0.09, + "grad_norm": 8.6378632826228, + "learning_rate": 9.909643486313533e-06, + "loss": 0.9008, + "step": 230 + }, + { + "epoch": 0.09, + "grad_norm": 8.619090369092227, + "learning_rate": 9.908454327162469e-06, + "loss": 0.7992, + "step": 231 + }, + { + "epoch": 0.09, + "grad_norm": 6.267889278364657, + "learning_rate": 9.90725746626209e-06, + "loss": 0.5345, + "step": 232 + }, + { + "epoch": 0.09, + "grad_norm": 6.113464436748009, + "learning_rate": 9.906052905490366e-06, + "loss": 0.5715, + "step": 233 + }, + { + "epoch": 0.09, + "grad_norm": 8.371927237132624, + "learning_rate": 9.904840646737346e-06, + "loss": 0.501, + "step": 234 + }, + { + "epoch": 0.09, + "grad_norm": 7.5624418707969285, + "learning_rate": 9.90362069190516e-06, + "loss": 0.5762, + "step": 235 + }, + { + "epoch": 0.09, + "grad_norm": 7.488590672245439, + "learning_rate": 9.902393042908015e-06, + "loss": 0.4949, + "step": 236 + }, + { + "epoch": 0.09, + "grad_norm": 6.720823201186881, + "learning_rate": 9.901157701672183e-06, + "loss": 0.4122, + "step": 237 + }, + { + "epoch": 0.09, + "grad_norm": 5.787613369924378, + "learning_rate": 9.899914670136016e-06, + "loss": 0.7468, + "step": 238 + }, + { + "epoch": 0.09, + "grad_norm": 6.214927476209819, + "learning_rate": 9.898663950249925e-06, + "loss": 0.3431, + "step": 239 + }, + { + "epoch": 0.09, + "grad_norm": 5.727400564621301, + "learning_rate": 9.89740554397639e-06, + "loss": 0.5952, + "step": 240 + }, + { + "epoch": 0.09, + "grad_norm": 6.402217775362403, + "learning_rate": 9.896139453289946e-06, + "loss": 0.6448, + "step": 241 + }, + { + "epoch": 0.09, + "grad_norm": 8.585103963636099, + "learning_rate": 9.89486568017719e-06, + "loss": 0.4152, + "step": 242 + }, + { + "epoch": 0.09, + "grad_norm": 8.647479694264918, + "learning_rate": 9.893584226636773e-06, + "loss": 0.8351, + "step": 243 + }, + { + "epoch": 0.09, + "grad_norm": 6.632200926220455, + "learning_rate": 9.892295094679394e-06, + "loss": 0.9451, + "step": 244 + }, + { + "epoch": 0.09, + "grad_norm": 7.414039713352258, + "learning_rate": 9.890998286327802e-06, + "loss": 0.6671, + "step": 245 + }, + { + "epoch": 0.1, + "grad_norm": 6.900483823935903, + "learning_rate": 9.889693803616793e-06, + "loss": 0.55, + "step": 246 + }, + { + "epoch": 0.1, + "grad_norm": 6.593605468779421, + "learning_rate": 9.8883816485932e-06, + "loss": 0.6476, + "step": 247 + }, + { + "epoch": 0.1, + "grad_norm": 6.609562634073377, + "learning_rate": 9.8870618233159e-06, + "loss": 0.6162, + "step": 248 + }, + { + "epoch": 0.1, + "grad_norm": 9.011815477328181, + "learning_rate": 9.885734329855798e-06, + "loss": 0.7807, + "step": 249 + }, + { + "epoch": 0.1, + "grad_norm": 6.26524254560341, + "learning_rate": 9.884399170295839e-06, + "loss": 0.2472, + "step": 250 + }, + { + "epoch": 0.1, + "grad_norm": 5.335370910528567, + "learning_rate": 9.883056346730992e-06, + "loss": 0.6568, + "step": 251 + }, + { + "epoch": 0.1, + "grad_norm": 6.997695191369208, + "learning_rate": 9.881705861268252e-06, + "loss": 0.7309, + "step": 252 + }, + { + "epoch": 0.1, + "grad_norm": 5.126963578159049, + "learning_rate": 9.880347716026635e-06, + "loss": 0.4084, + "step": 253 + }, + { + "epoch": 0.1, + "grad_norm": 7.079237445427333, + "learning_rate": 9.878981913137178e-06, + "loss": 0.788, + "step": 254 + }, + { + "epoch": 0.1, + "grad_norm": 6.049948294026413, + "learning_rate": 9.877608454742935e-06, + "loss": 0.6085, + "step": 255 + }, + { + "epoch": 0.1, + "grad_norm": 6.257499830934716, + "learning_rate": 9.87622734299897e-06, + "loss": 0.6666, + "step": 256 + }, + { + "epoch": 0.1, + "grad_norm": 7.651887244077257, + "learning_rate": 9.87483858007235e-06, + "loss": 0.4793, + "step": 257 + }, + { + "epoch": 0.1, + "grad_norm": 6.043785471550064, + "learning_rate": 9.873442168142158e-06, + "loss": 0.6279, + "step": 258 + }, + { + "epoch": 0.1, + "grad_norm": 7.158571435308082, + "learning_rate": 9.872038109399474e-06, + "loss": 0.734, + "step": 259 + }, + { + "epoch": 0.1, + "grad_norm": 6.798279585084529, + "learning_rate": 9.870626406047372e-06, + "loss": 0.7056, + "step": 260 + }, + { + "epoch": 0.1, + "grad_norm": 6.80425883777596, + "learning_rate": 9.869207060300929e-06, + "loss": 0.5749, + "step": 261 + }, + { + "epoch": 0.1, + "grad_norm": 7.590455161475363, + "learning_rate": 9.867780074387207e-06, + "loss": 0.2604, + "step": 262 + }, + { + "epoch": 0.1, + "grad_norm": 5.525151836428044, + "learning_rate": 9.86634545054526e-06, + "loss": 0.267, + "step": 263 + }, + { + "epoch": 0.1, + "grad_norm": 5.708055135034483, + "learning_rate": 9.864903191026125e-06, + "loss": 0.2953, + "step": 264 + }, + { + "epoch": 0.1, + "grad_norm": 6.3983965002891905, + "learning_rate": 9.86345329809282e-06, + "loss": 0.5049, + "step": 265 + }, + { + "epoch": 0.1, + "grad_norm": 6.781667712208913, + "learning_rate": 9.861995774020341e-06, + "loss": 0.656, + "step": 266 + }, + { + "epoch": 0.1, + "grad_norm": 6.289483360206045, + "learning_rate": 9.860530621095659e-06, + "loss": 0.6584, + "step": 267 + }, + { + "epoch": 0.1, + "grad_norm": 6.0602707296292895, + "learning_rate": 9.859057841617709e-06, + "loss": 0.4042, + "step": 268 + }, + { + "epoch": 0.1, + "grad_norm": 20.867771447040084, + "learning_rate": 9.857577437897401e-06, + "loss": 0.7279, + "step": 269 + }, + { + "epoch": 0.1, + "grad_norm": 8.68379750893328, + "learning_rate": 9.856089412257605e-06, + "loss": 0.7704, + "step": 270 + }, + { + "epoch": 0.1, + "grad_norm": 7.956331836477761, + "learning_rate": 9.854593767033147e-06, + "loss": 0.746, + "step": 271 + }, + { + "epoch": 0.11, + "grad_norm": 7.383533768646651, + "learning_rate": 9.853090504570813e-06, + "loss": 0.5509, + "step": 272 + }, + { + "epoch": 0.11, + "grad_norm": 6.46597403560279, + "learning_rate": 9.85157962722934e-06, + "loss": 0.6701, + "step": 273 + }, + { + "epoch": 0.11, + "grad_norm": 8.613120765745608, + "learning_rate": 9.850061137379414e-06, + "loss": 0.7618, + "step": 274 + }, + { + "epoch": 0.11, + "grad_norm": 7.300109956666041, + "learning_rate": 9.84853503740366e-06, + "loss": 0.4096, + "step": 275 + }, + { + "epoch": 0.11, + "grad_norm": 6.885673288118975, + "learning_rate": 9.847001329696653e-06, + "loss": 0.4864, + "step": 276 + }, + { + "epoch": 0.11, + "grad_norm": 7.191105323565091, + "learning_rate": 9.845460016664899e-06, + "loss": 0.3362, + "step": 277 + }, + { + "epoch": 0.11, + "grad_norm": 6.775653473869663, + "learning_rate": 9.843911100726838e-06, + "loss": 0.6733, + "step": 278 + }, + { + "epoch": 0.11, + "grad_norm": 9.646672616812669, + "learning_rate": 9.842354584312841e-06, + "loss": 1.1503, + "step": 279 + }, + { + "epoch": 0.11, + "grad_norm": 6.893398851266994, + "learning_rate": 9.840790469865205e-06, + "loss": 0.5574, + "step": 280 + }, + { + "epoch": 0.11, + "grad_norm": 6.8364563178155295, + "learning_rate": 9.839218759838146e-06, + "loss": 0.541, + "step": 281 + }, + { + "epoch": 0.11, + "grad_norm": 8.942051863781865, + "learning_rate": 9.837639456697802e-06, + "loss": 0.3568, + "step": 282 + }, + { + "epoch": 0.11, + "grad_norm": 7.876918111726609, + "learning_rate": 9.836052562922224e-06, + "loss": 0.3939, + "step": 283 + }, + { + "epoch": 0.11, + "grad_norm": 5.939547804165729, + "learning_rate": 9.83445808100137e-06, + "loss": 0.2633, + "step": 284 + }, + { + "epoch": 0.11, + "grad_norm": 9.043850201881881, + "learning_rate": 9.83285601343711e-06, + "loss": 0.3511, + "step": 285 + }, + { + "epoch": 0.11, + "grad_norm": 6.61884688616227, + "learning_rate": 9.83124636274321e-06, + "loss": 0.6302, + "step": 286 + }, + { + "epoch": 0.11, + "grad_norm": 6.039344836134989, + "learning_rate": 9.829629131445342e-06, + "loss": 0.3386, + "step": 287 + }, + { + "epoch": 0.11, + "grad_norm": 6.956129723796397, + "learning_rate": 9.828004322081067e-06, + "loss": 0.4755, + "step": 288 + }, + { + "epoch": 0.11, + "grad_norm": 6.5635099147781375, + "learning_rate": 9.826371937199837e-06, + "loss": 0.6223, + "step": 289 + }, + { + "epoch": 0.11, + "grad_norm": 5.435171788762695, + "learning_rate": 9.824731979362991e-06, + "loss": 0.4679, + "step": 290 + }, + { + "epoch": 0.11, + "grad_norm": 7.412793395992515, + "learning_rate": 9.823084451143755e-06, + "loss": 0.5032, + "step": 291 + }, + { + "epoch": 0.11, + "grad_norm": 5.2313885489579075, + "learning_rate": 9.821429355127225e-06, + "loss": 0.4112, + "step": 292 + }, + { + "epoch": 0.11, + "grad_norm": 7.831319321123592, + "learning_rate": 9.819766693910382e-06, + "loss": 0.5151, + "step": 293 + }, + { + "epoch": 0.11, + "grad_norm": 4.879006356515538, + "learning_rate": 9.818096470102067e-06, + "loss": 0.4978, + "step": 294 + }, + { + "epoch": 0.11, + "grad_norm": 7.035423996978485, + "learning_rate": 9.816418686322995e-06, + "loss": 0.4944, + "step": 295 + }, + { + "epoch": 0.11, + "grad_norm": 8.641312697525722, + "learning_rate": 9.814733345205736e-06, + "loss": 0.8333, + "step": 296 + }, + { + "epoch": 0.11, + "grad_norm": 4.845084661500621, + "learning_rate": 9.813040449394728e-06, + "loss": 0.3966, + "step": 297 + }, + { + "epoch": 0.12, + "grad_norm": 8.171572463981029, + "learning_rate": 9.811340001546252e-06, + "loss": 0.5276, + "step": 298 + }, + { + "epoch": 0.12, + "grad_norm": 5.921858936764608, + "learning_rate": 9.809632004328448e-06, + "loss": 0.5579, + "step": 299 + }, + { + "epoch": 0.12, + "grad_norm": 6.145090683302827, + "learning_rate": 9.807916460421294e-06, + "loss": 0.5193, + "step": 300 + }, + { + "epoch": 0.12, + "grad_norm": 5.836341525658862, + "learning_rate": 9.806193372516615e-06, + "loss": 0.4026, + "step": 301 + }, + { + "epoch": 0.12, + "grad_norm": 6.581487511831342, + "learning_rate": 9.80446274331807e-06, + "loss": 0.4996, + "step": 302 + }, + { + "epoch": 0.12, + "grad_norm": 6.255141502498269, + "learning_rate": 9.802724575541152e-06, + "loss": 0.6947, + "step": 303 + }, + { + "epoch": 0.12, + "grad_norm": 9.386386426049397, + "learning_rate": 9.800978871913181e-06, + "loss": 0.3353, + "step": 304 + }, + { + "epoch": 0.12, + "grad_norm": 6.538415251290061, + "learning_rate": 9.799225635173305e-06, + "loss": 0.5751, + "step": 305 + }, + { + "epoch": 0.12, + "grad_norm": 7.326703067594579, + "learning_rate": 9.797464868072489e-06, + "loss": 0.6409, + "step": 306 + }, + { + "epoch": 0.12, + "grad_norm": 7.840338885309457, + "learning_rate": 9.795696573373511e-06, + "loss": 0.4939, + "step": 307 + }, + { + "epoch": 0.12, + "grad_norm": 6.812226879320205, + "learning_rate": 9.793920753850972e-06, + "loss": 0.5296, + "step": 308 + }, + { + "epoch": 0.12, + "grad_norm": 6.284231927305214, + "learning_rate": 9.792137412291265e-06, + "loss": 0.3649, + "step": 309 + }, + { + "epoch": 0.12, + "grad_norm": 9.326199089079676, + "learning_rate": 9.790346551492594e-06, + "loss": 0.5225, + "step": 310 + }, + { + "epoch": 0.12, + "grad_norm": 5.60346439100645, + "learning_rate": 9.788548174264961e-06, + "loss": 0.3588, + "step": 311 + }, + { + "epoch": 0.12, + "grad_norm": 6.089527981344691, + "learning_rate": 9.78674228343016e-06, + "loss": 0.4737, + "step": 312 + }, + { + "epoch": 0.12, + "grad_norm": 6.651164249429784, + "learning_rate": 9.784928881821776e-06, + "loss": 0.5429, + "step": 313 + }, + { + "epoch": 0.12, + "grad_norm": 8.42810315949908, + "learning_rate": 9.783107972285177e-06, + "loss": 0.5282, + "step": 314 + }, + { + "epoch": 0.12, + "grad_norm": 5.4585265485361045, + "learning_rate": 9.781279557677514e-06, + "loss": 0.4829, + "step": 315 + }, + { + "epoch": 0.12, + "grad_norm": 4.873939546653713, + "learning_rate": 9.779443640867712e-06, + "loss": 0.3218, + "step": 316 + }, + { + "epoch": 0.12, + "grad_norm": 7.271643730021547, + "learning_rate": 9.777600224736468e-06, + "loss": 0.7987, + "step": 317 + }, + { + "epoch": 0.12, + "grad_norm": 6.862633180212283, + "learning_rate": 9.775749312176249e-06, + "loss": 0.8568, + "step": 318 + }, + { + "epoch": 0.12, + "grad_norm": 5.214642683741037, + "learning_rate": 9.773890906091275e-06, + "loss": 0.4015, + "step": 319 + }, + { + "epoch": 0.12, + "grad_norm": 4.97107359956487, + "learning_rate": 9.772025009397538e-06, + "loss": 0.3971, + "step": 320 + }, + { + "epoch": 0.12, + "grad_norm": 5.433346044875373, + "learning_rate": 9.770151625022772e-06, + "loss": 0.3841, + "step": 321 + }, + { + "epoch": 0.12, + "grad_norm": 8.943137012235509, + "learning_rate": 9.768270755906467e-06, + "loss": 0.5485, + "step": 322 + }, + { + "epoch": 0.12, + "grad_norm": 5.649512258026864, + "learning_rate": 9.76638240499985e-06, + "loss": 0.3617, + "step": 323 + }, + { + "epoch": 0.13, + "grad_norm": 8.23323791908471, + "learning_rate": 9.764486575265893e-06, + "loss": 0.7007, + "step": 324 + }, + { + "epoch": 0.13, + "grad_norm": 6.567604255024964, + "learning_rate": 9.762583269679304e-06, + "loss": 0.6144, + "step": 325 + }, + { + "epoch": 0.13, + "grad_norm": 4.892714103609171, + "learning_rate": 9.760672491226515e-06, + "loss": 0.2174, + "step": 326 + }, + { + "epoch": 0.13, + "grad_norm": 5.774746583839811, + "learning_rate": 9.758754242905688e-06, + "loss": 0.7154, + "step": 327 + }, + { + "epoch": 0.13, + "grad_norm": 5.714795038248178, + "learning_rate": 9.756828527726706e-06, + "loss": 0.5746, + "step": 328 + }, + { + "epoch": 0.13, + "grad_norm": 7.471646883651928, + "learning_rate": 9.754895348711167e-06, + "loss": 0.54, + "step": 329 + }, + { + "epoch": 0.13, + "grad_norm": 8.989112148176543, + "learning_rate": 9.752954708892379e-06, + "loss": 0.7058, + "step": 330 + }, + { + "epoch": 0.13, + "grad_norm": 6.928347523538836, + "learning_rate": 9.751006611315357e-06, + "loss": 0.7443, + "step": 331 + }, + { + "epoch": 0.13, + "grad_norm": 7.585908113982128, + "learning_rate": 9.749051059036821e-06, + "loss": 0.456, + "step": 332 + }, + { + "epoch": 0.13, + "grad_norm": 8.117243531401913, + "learning_rate": 9.747088055125186e-06, + "loss": 0.5869, + "step": 333 + }, + { + "epoch": 0.13, + "grad_norm": 5.6263618774567155, + "learning_rate": 9.745117602660556e-06, + "loss": 0.3162, + "step": 334 + }, + { + "epoch": 0.13, + "grad_norm": 4.707229023801819, + "learning_rate": 9.743139704734729e-06, + "loss": 0.6052, + "step": 335 + }, + { + "epoch": 0.13, + "grad_norm": 7.237336641601359, + "learning_rate": 9.741154364451179e-06, + "loss": 0.5417, + "step": 336 + }, + { + "epoch": 0.13, + "grad_norm": 5.41298988622075, + "learning_rate": 9.739161584925061e-06, + "loss": 0.2447, + "step": 337 + }, + { + "epoch": 0.13, + "grad_norm": 5.849373058789376, + "learning_rate": 9.737161369283201e-06, + "loss": 0.4595, + "step": 338 + }, + { + "epoch": 0.13, + "grad_norm": 5.35682340061978, + "learning_rate": 9.735153720664096e-06, + "loss": 0.6239, + "step": 339 + }, + { + "epoch": 0.13, + "grad_norm": 8.969091935864224, + "learning_rate": 9.733138642217905e-06, + "loss": 0.7563, + "step": 340 + }, + { + "epoch": 0.13, + "grad_norm": 8.45712541102906, + "learning_rate": 9.731116137106441e-06, + "loss": 0.7126, + "step": 341 + }, + { + "epoch": 0.13, + "grad_norm": 9.377006008503654, + "learning_rate": 9.729086208503174e-06, + "loss": 0.3643, + "step": 342 + }, + { + "epoch": 0.13, + "grad_norm": 7.066581448477598, + "learning_rate": 9.727048859593223e-06, + "loss": 0.6599, + "step": 343 + }, + { + "epoch": 0.13, + "grad_norm": 8.254306487939065, + "learning_rate": 9.725004093573343e-06, + "loss": 0.6487, + "step": 344 + }, + { + "epoch": 0.13, + "grad_norm": 6.392083079398413, + "learning_rate": 9.722951913651937e-06, + "loss": 0.3145, + "step": 345 + }, + { + "epoch": 0.13, + "grad_norm": 7.235957620735297, + "learning_rate": 9.720892323049034e-06, + "loss": 0.4961, + "step": 346 + }, + { + "epoch": 0.13, + "grad_norm": 5.345313997047523, + "learning_rate": 9.718825324996294e-06, + "loss": 0.4739, + "step": 347 + }, + { + "epoch": 0.13, + "grad_norm": 6.745936546272732, + "learning_rate": 9.716750922736998e-06, + "loss": 0.6252, + "step": 348 + }, + { + "epoch": 0.13, + "grad_norm": 10.734311635581271, + "learning_rate": 9.714669119526043e-06, + "loss": 0.2562, + "step": 349 + }, + { + "epoch": 0.14, + "grad_norm": 6.704455470536539, + "learning_rate": 9.712579918629947e-06, + "loss": 0.5932, + "step": 350 + }, + { + "epoch": 0.14, + "grad_norm": 8.321438502428611, + "learning_rate": 9.710483323326826e-06, + "loss": 0.6632, + "step": 351 + }, + { + "epoch": 0.14, + "grad_norm": 5.984339774346785, + "learning_rate": 9.708379336906404e-06, + "loss": 0.3536, + "step": 352 + }, + { + "epoch": 0.14, + "grad_norm": 6.11816595055312, + "learning_rate": 9.706267962669999e-06, + "loss": 0.2847, + "step": 353 + }, + { + "epoch": 0.14, + "grad_norm": 5.958987770737341, + "learning_rate": 9.704149203930522e-06, + "loss": 0.4137, + "step": 354 + }, + { + "epoch": 0.14, + "grad_norm": 6.50824457960139, + "learning_rate": 9.702023064012473e-06, + "loss": 0.5092, + "step": 355 + }, + { + "epoch": 0.14, + "grad_norm": 7.5318192556780845, + "learning_rate": 9.699889546251934e-06, + "loss": 0.506, + "step": 356 + }, + { + "epoch": 0.14, + "grad_norm": 4.665264867994612, + "learning_rate": 9.697748653996558e-06, + "loss": 0.4056, + "step": 357 + }, + { + "epoch": 0.14, + "grad_norm": 6.591022259272521, + "learning_rate": 9.695600390605573e-06, + "loss": 0.4226, + "step": 358 + }, + { + "epoch": 0.14, + "grad_norm": 7.44892332271176, + "learning_rate": 9.693444759449772e-06, + "loss": 0.7158, + "step": 359 + }, + { + "epoch": 0.14, + "grad_norm": 4.528700345746769, + "learning_rate": 9.691281763911513e-06, + "loss": 0.1974, + "step": 360 + }, + { + "epoch": 0.14, + "grad_norm": 6.440396887673768, + "learning_rate": 9.689111407384703e-06, + "loss": 0.6358, + "step": 361 + }, + { + "epoch": 0.14, + "grad_norm": 6.263669295314319, + "learning_rate": 9.686933693274801e-06, + "loss": 0.4228, + "step": 362 + }, + { + "epoch": 0.14, + "grad_norm": 6.7863239124259795, + "learning_rate": 9.68474862499881e-06, + "loss": 0.7634, + "step": 363 + }, + { + "epoch": 0.14, + "grad_norm": 7.544318757098548, + "learning_rate": 9.682556205985274e-06, + "loss": 0.7168, + "step": 364 + }, + { + "epoch": 0.14, + "grad_norm": 7.915395816252159, + "learning_rate": 9.680356439674272e-06, + "loss": 0.6839, + "step": 365 + }, + { + "epoch": 0.14, + "grad_norm": 4.734733387774487, + "learning_rate": 9.67814932951741e-06, + "loss": 0.2306, + "step": 366 + }, + { + "epoch": 0.14, + "grad_norm": 4.8398546251233645, + "learning_rate": 9.675934878977814e-06, + "loss": 0.469, + "step": 367 + }, + { + "epoch": 0.14, + "grad_norm": 6.854795901737554, + "learning_rate": 9.67371309153013e-06, + "loss": 0.9874, + "step": 368 + }, + { + "epoch": 0.14, + "grad_norm": 6.637104635529683, + "learning_rate": 9.671483970660519e-06, + "loss": 0.514, + "step": 369 + }, + { + "epoch": 0.14, + "grad_norm": 5.846763652901588, + "learning_rate": 9.669247519866645e-06, + "loss": 0.6784, + "step": 370 + }, + { + "epoch": 0.14, + "grad_norm": 7.824369828623507, + "learning_rate": 9.667003742657674e-06, + "loss": 0.7653, + "step": 371 + }, + { + "epoch": 0.14, + "grad_norm": 6.407640051164882, + "learning_rate": 9.664752642554272e-06, + "loss": 0.4574, + "step": 372 + }, + { + "epoch": 0.14, + "grad_norm": 5.629447184503669, + "learning_rate": 9.662494223088586e-06, + "loss": 0.4313, + "step": 373 + }, + { + "epoch": 0.14, + "grad_norm": 8.59917689662875, + "learning_rate": 9.660228487804254e-06, + "loss": 0.1838, + "step": 374 + }, + { + "epoch": 0.14, + "grad_norm": 5.598790128111294, + "learning_rate": 9.657955440256396e-06, + "loss": 0.4146, + "step": 375 + }, + { + "epoch": 0.15, + "grad_norm": 6.6925984270154135, + "learning_rate": 9.655675084011597e-06, + "loss": 0.6849, + "step": 376 + }, + { + "epoch": 0.15, + "grad_norm": 6.231477605073852, + "learning_rate": 9.653387422647918e-06, + "loss": 0.3639, + "step": 377 + }, + { + "epoch": 0.15, + "grad_norm": 8.381814285110513, + "learning_rate": 9.651092459754879e-06, + "loss": 0.6685, + "step": 378 + }, + { + "epoch": 0.15, + "grad_norm": 6.552681803361334, + "learning_rate": 9.648790198933452e-06, + "loss": 0.2848, + "step": 379 + }, + { + "epoch": 0.15, + "grad_norm": 5.34831473491917, + "learning_rate": 9.64648064379607e-06, + "loss": 0.5052, + "step": 380 + }, + { + "epoch": 0.15, + "grad_norm": 7.151382078570767, + "learning_rate": 9.644163797966604e-06, + "loss": 0.6374, + "step": 381 + }, + { + "epoch": 0.15, + "grad_norm": 4.673448235622143, + "learning_rate": 9.641839665080363e-06, + "loss": 0.3938, + "step": 382 + }, + { + "epoch": 0.15, + "grad_norm": 6.451456369553214, + "learning_rate": 9.6395082487841e-06, + "loss": 0.2418, + "step": 383 + }, + { + "epoch": 0.15, + "grad_norm": 6.835227594492041, + "learning_rate": 9.637169552735985e-06, + "loss": 0.646, + "step": 384 + }, + { + "epoch": 0.15, + "grad_norm": 4.611919642055182, + "learning_rate": 9.634823580605616e-06, + "loss": 0.2649, + "step": 385 + }, + { + "epoch": 0.15, + "grad_norm": 5.999553603810167, + "learning_rate": 9.632470336074009e-06, + "loss": 0.6146, + "step": 386 + }, + { + "epoch": 0.15, + "grad_norm": 4.88211570975364, + "learning_rate": 9.63010982283359e-06, + "loss": 0.4935, + "step": 387 + }, + { + "epoch": 0.15, + "grad_norm": 5.969216796160313, + "learning_rate": 9.627742044588183e-06, + "loss": 0.4057, + "step": 388 + }, + { + "epoch": 0.15, + "grad_norm": 7.275824770459739, + "learning_rate": 9.625367005053023e-06, + "loss": 0.2308, + "step": 389 + }, + { + "epoch": 0.15, + "grad_norm": 6.116035677820355, + "learning_rate": 9.622984707954732e-06, + "loss": 0.2201, + "step": 390 + }, + { + "epoch": 0.15, + "grad_norm": 5.6867022195529024, + "learning_rate": 9.62059515703132e-06, + "loss": 0.3027, + "step": 391 + }, + { + "epoch": 0.15, + "grad_norm": 7.22863003774383, + "learning_rate": 9.61819835603218e-06, + "loss": 0.4913, + "step": 392 + }, + { + "epoch": 0.15, + "grad_norm": 4.565778187479567, + "learning_rate": 9.61579430871808e-06, + "loss": 0.4912, + "step": 393 + }, + { + "epoch": 0.15, + "grad_norm": 6.457178472171472, + "learning_rate": 9.613383018861159e-06, + "loss": 0.4607, + "step": 394 + }, + { + "epoch": 0.15, + "grad_norm": 6.041290869912107, + "learning_rate": 9.610964490244921e-06, + "loss": 0.349, + "step": 395 + }, + { + "epoch": 0.15, + "grad_norm": 6.947518287356081, + "learning_rate": 9.608538726664224e-06, + "loss": 0.5991, + "step": 396 + }, + { + "epoch": 0.15, + "grad_norm": 6.511793169400645, + "learning_rate": 9.606105731925284e-06, + "loss": 0.6116, + "step": 397 + }, + { + "epoch": 0.15, + "grad_norm": 4.675740378847589, + "learning_rate": 9.603665509845657e-06, + "loss": 0.2027, + "step": 398 + }, + { + "epoch": 0.15, + "grad_norm": 4.026549637984747, + "learning_rate": 9.601218064254245e-06, + "loss": 0.438, + "step": 399 + }, + { + "epoch": 0.15, + "grad_norm": 7.778939427784003, + "learning_rate": 9.598763398991283e-06, + "loss": 0.768, + "step": 400 + }, + { + "epoch": 0.16, + "grad_norm": 7.793963249204283, + "learning_rate": 9.596301517908329e-06, + "loss": 0.7986, + "step": 401 + }, + { + "epoch": 0.16, + "grad_norm": 5.71313794103704, + "learning_rate": 9.593832424868271e-06, + "loss": 0.7055, + "step": 402 + }, + { + "epoch": 0.16, + "grad_norm": 6.614542708882127, + "learning_rate": 9.591356123745307e-06, + "loss": 0.6463, + "step": 403 + }, + { + "epoch": 0.16, + "grad_norm": 5.041038055336509, + "learning_rate": 9.588872618424949e-06, + "loss": 0.5673, + "step": 404 + }, + { + "epoch": 0.16, + "grad_norm": 5.405384635242469, + "learning_rate": 9.58638191280401e-06, + "loss": 0.2453, + "step": 405 + }, + { + "epoch": 0.16, + "grad_norm": 6.20641170842102, + "learning_rate": 9.583884010790605e-06, + "loss": 0.6098, + "step": 406 + }, + { + "epoch": 0.16, + "grad_norm": 9.614248272427313, + "learning_rate": 9.581378916304134e-06, + "loss": 0.5737, + "step": 407 + }, + { + "epoch": 0.16, + "grad_norm": 7.763469821060781, + "learning_rate": 9.578866633275289e-06, + "loss": 0.3714, + "step": 408 + }, + { + "epoch": 0.16, + "grad_norm": 5.7575888291758375, + "learning_rate": 9.576347165646038e-06, + "loss": 0.7211, + "step": 409 + }, + { + "epoch": 0.16, + "grad_norm": 5.33997986745691, + "learning_rate": 9.573820517369623e-06, + "loss": 0.3097, + "step": 410 + }, + { + "epoch": 0.16, + "grad_norm": 5.274820287363962, + "learning_rate": 9.571286692410553e-06, + "loss": 0.276, + "step": 411 + }, + { + "epoch": 0.16, + "grad_norm": 6.817705244436653, + "learning_rate": 9.568745694744599e-06, + "loss": 0.6004, + "step": 412 + }, + { + "epoch": 0.16, + "grad_norm": 6.148630748957115, + "learning_rate": 9.566197528358785e-06, + "loss": 0.6065, + "step": 413 + }, + { + "epoch": 0.16, + "grad_norm": 7.519936676810737, + "learning_rate": 9.563642197251382e-06, + "loss": 0.5844, + "step": 414 + }, + { + "epoch": 0.16, + "grad_norm": 6.687868076343516, + "learning_rate": 9.561079705431907e-06, + "loss": 0.3832, + "step": 415 + }, + { + "epoch": 0.16, + "grad_norm": 6.231937884394182, + "learning_rate": 9.55851005692111e-06, + "loss": 0.5117, + "step": 416 + }, + { + "epoch": 0.16, + "grad_norm": 6.046340559624703, + "learning_rate": 9.555933255750968e-06, + "loss": 0.3931, + "step": 417 + }, + { + "epoch": 0.16, + "grad_norm": 7.316621022594312, + "learning_rate": 9.553349305964687e-06, + "loss": 0.6415, + "step": 418 + }, + { + "epoch": 0.16, + "grad_norm": 5.5185681614742395, + "learning_rate": 9.550758211616684e-06, + "loss": 0.6544, + "step": 419 + }, + { + "epoch": 0.16, + "grad_norm": 7.106881598647304, + "learning_rate": 9.548159976772593e-06, + "loss": 0.2595, + "step": 420 + }, + { + "epoch": 0.16, + "grad_norm": 5.923890852527746, + "learning_rate": 9.545554605509244e-06, + "loss": 0.5632, + "step": 421 + }, + { + "epoch": 0.16, + "grad_norm": 5.8413519974174575, + "learning_rate": 9.54294210191467e-06, + "loss": 0.2704, + "step": 422 + }, + { + "epoch": 0.16, + "grad_norm": 5.435662954282161, + "learning_rate": 9.540322470088094e-06, + "loss": 0.3175, + "step": 423 + }, + { + "epoch": 0.16, + "grad_norm": 6.8171840997736455, + "learning_rate": 9.537695714139925e-06, + "loss": 0.5396, + "step": 424 + }, + { + "epoch": 0.16, + "grad_norm": 5.937493762708229, + "learning_rate": 9.535061838191746e-06, + "loss": 0.387, + "step": 425 + }, + { + "epoch": 0.16, + "grad_norm": 5.553249957249877, + "learning_rate": 9.532420846376316e-06, + "loss": 0.1648, + "step": 426 + }, + { + "epoch": 0.17, + "grad_norm": 5.989119236935052, + "learning_rate": 9.529772742837557e-06, + "loss": 0.7881, + "step": 427 + }, + { + "epoch": 0.17, + "grad_norm": 3.912295985527433, + "learning_rate": 9.527117531730552e-06, + "loss": 0.4188, + "step": 428 + }, + { + "epoch": 0.17, + "grad_norm": 5.481371641086089, + "learning_rate": 9.524455217221537e-06, + "loss": 0.6939, + "step": 429 + }, + { + "epoch": 0.17, + "grad_norm": 5.336352083185281, + "learning_rate": 9.521785803487888e-06, + "loss": 0.4077, + "step": 430 + }, + { + "epoch": 0.17, + "grad_norm": 5.42636333137998, + "learning_rate": 9.519109294718127e-06, + "loss": 0.5134, + "step": 431 + }, + { + "epoch": 0.17, + "grad_norm": 5.5944462019201575, + "learning_rate": 9.516425695111906e-06, + "loss": 0.3965, + "step": 432 + }, + { + "epoch": 0.17, + "grad_norm": 7.282364249353646, + "learning_rate": 9.513735008880001e-06, + "loss": 0.4672, + "step": 433 + }, + { + "epoch": 0.17, + "grad_norm": 7.4145928824884, + "learning_rate": 9.51103724024431e-06, + "loss": 0.7779, + "step": 434 + }, + { + "epoch": 0.17, + "grad_norm": 5.533429081064421, + "learning_rate": 9.508332393437845e-06, + "loss": 0.4267, + "step": 435 + }, + { + "epoch": 0.17, + "grad_norm": 6.71155794896511, + "learning_rate": 9.505620472704721e-06, + "loss": 0.5358, + "step": 436 + }, + { + "epoch": 0.17, + "grad_norm": 5.648989084603511, + "learning_rate": 9.502901482300155e-06, + "loss": 0.7728, + "step": 437 + }, + { + "epoch": 0.17, + "grad_norm": 6.161754705078291, + "learning_rate": 9.500175426490455e-06, + "loss": 0.5708, + "step": 438 + }, + { + "epoch": 0.17, + "grad_norm": 6.095891733875877, + "learning_rate": 9.497442309553017e-06, + "loss": 0.4843, + "step": 439 + }, + { + "epoch": 0.17, + "grad_norm": 6.181342270774325, + "learning_rate": 9.494702135776315e-06, + "loss": 0.7018, + "step": 440 + }, + { + "epoch": 0.17, + "grad_norm": 4.086856274717312, + "learning_rate": 9.491954909459895e-06, + "loss": 0.155, + "step": 441 + }, + { + "epoch": 0.17, + "grad_norm": 5.586954161640706, + "learning_rate": 9.489200634914373e-06, + "loss": 0.591, + "step": 442 + }, + { + "epoch": 0.17, + "grad_norm": 5.640634780202634, + "learning_rate": 9.48643931646142e-06, + "loss": 0.6774, + "step": 443 + }, + { + "epoch": 0.17, + "grad_norm": 5.675825617162014, + "learning_rate": 9.48367095843376e-06, + "loss": 0.2631, + "step": 444 + }, + { + "epoch": 0.17, + "grad_norm": 5.066578740033582, + "learning_rate": 9.480895565175166e-06, + "loss": 0.3844, + "step": 445 + }, + { + "epoch": 0.17, + "grad_norm": 6.009829294775996, + "learning_rate": 9.478113141040444e-06, + "loss": 0.3959, + "step": 446 + }, + { + "epoch": 0.17, + "grad_norm": 4.229930106917378, + "learning_rate": 9.475323690395439e-06, + "loss": 0.3022, + "step": 447 + }, + { + "epoch": 0.17, + "grad_norm": 4.788266587913981, + "learning_rate": 9.472527217617016e-06, + "loss": 0.5625, + "step": 448 + }, + { + "epoch": 0.17, + "grad_norm": 3.720053712994818, + "learning_rate": 9.469723727093061e-06, + "loss": 0.4241, + "step": 449 + }, + { + "epoch": 0.17, + "grad_norm": 6.114279692875901, + "learning_rate": 9.466913223222467e-06, + "loss": 0.7787, + "step": 450 + }, + { + "epoch": 0.17, + "grad_norm": 6.488837694516971, + "learning_rate": 9.464095710415138e-06, + "loss": 0.5442, + "step": 451 + }, + { + "epoch": 0.17, + "grad_norm": 5.884916870489228, + "learning_rate": 9.461271193091971e-06, + "loss": 0.4661, + "step": 452 + }, + { + "epoch": 0.18, + "grad_norm": 6.123702868706546, + "learning_rate": 9.458439675684854e-06, + "loss": 0.5812, + "step": 453 + }, + { + "epoch": 0.18, + "grad_norm": 7.776381518764753, + "learning_rate": 9.455601162636662e-06, + "loss": 0.4924, + "step": 454 + }, + { + "epoch": 0.18, + "grad_norm": 6.250665275980305, + "learning_rate": 9.452755658401243e-06, + "loss": 0.5599, + "step": 455 + }, + { + "epoch": 0.18, + "grad_norm": 5.813101213507731, + "learning_rate": 9.449903167443415e-06, + "loss": 0.8663, + "step": 456 + }, + { + "epoch": 0.18, + "grad_norm": 8.017585706041487, + "learning_rate": 9.447043694238961e-06, + "loss": 0.4124, + "step": 457 + }, + { + "epoch": 0.18, + "grad_norm": 4.5162848791395644, + "learning_rate": 9.444177243274619e-06, + "loss": 0.4879, + "step": 458 + }, + { + "epoch": 0.18, + "grad_norm": 4.799692304435781, + "learning_rate": 9.441303819048073e-06, + "loss": 0.1965, + "step": 459 + }, + { + "epoch": 0.18, + "grad_norm": 4.663076169983753, + "learning_rate": 9.438423426067953e-06, + "loss": 0.4772, + "step": 460 + }, + { + "epoch": 0.18, + "grad_norm": 5.7057492095885465, + "learning_rate": 9.435536068853819e-06, + "loss": 0.6837, + "step": 461 + }, + { + "epoch": 0.18, + "grad_norm": 5.576537087092552, + "learning_rate": 9.432641751936162e-06, + "loss": 0.5122, + "step": 462 + }, + { + "epoch": 0.18, + "grad_norm": 6.802315042476752, + "learning_rate": 9.42974047985639e-06, + "loss": 0.6463, + "step": 463 + }, + { + "epoch": 0.18, + "grad_norm": 4.965963334451556, + "learning_rate": 9.426832257166831e-06, + "loss": 0.2304, + "step": 464 + }, + { + "epoch": 0.18, + "grad_norm": 6.49349567152424, + "learning_rate": 9.42391708843071e-06, + "loss": 0.6584, + "step": 465 + }, + { + "epoch": 0.18, + "grad_norm": 4.956976424505751, + "learning_rate": 9.420994978222156e-06, + "loss": 0.2796, + "step": 466 + }, + { + "epoch": 0.18, + "grad_norm": 6.895187589079934, + "learning_rate": 9.418065931126188e-06, + "loss": 0.7422, + "step": 467 + }, + { + "epoch": 0.18, + "grad_norm": 5.162009865187619, + "learning_rate": 9.415129951738713e-06, + "loss": 0.7592, + "step": 468 + }, + { + "epoch": 0.18, + "grad_norm": 7.074943467654599, + "learning_rate": 9.41218704466651e-06, + "loss": 0.6322, + "step": 469 + }, + { + "epoch": 0.18, + "grad_norm": 8.489214335013715, + "learning_rate": 9.40923721452723e-06, + "loss": 0.6547, + "step": 470 + }, + { + "epoch": 0.18, + "grad_norm": 7.262757783995714, + "learning_rate": 9.406280465949394e-06, + "loss": 0.3637, + "step": 471 + }, + { + "epoch": 0.18, + "grad_norm": 5.0907325669614325, + "learning_rate": 9.403316803572363e-06, + "loss": 0.5202, + "step": 472 + }, + { + "epoch": 0.18, + "grad_norm": 5.885826552650618, + "learning_rate": 9.400346232046361e-06, + "loss": 0.7693, + "step": 473 + }, + { + "epoch": 0.18, + "grad_norm": 5.757160363287021, + "learning_rate": 9.397368756032445e-06, + "loss": 0.3856, + "step": 474 + }, + { + "epoch": 0.18, + "grad_norm": 5.178620343935924, + "learning_rate": 9.394384380202511e-06, + "loss": 0.4919, + "step": 475 + }, + { + "epoch": 0.18, + "grad_norm": 5.044643698327802, + "learning_rate": 9.391393109239277e-06, + "loss": 0.3934, + "step": 476 + }, + { + "epoch": 0.18, + "grad_norm": 4.03917926402934, + "learning_rate": 9.388394947836278e-06, + "loss": 0.4205, + "step": 477 + }, + { + "epoch": 0.18, + "grad_norm": 5.6081217690733896, + "learning_rate": 9.38538990069787e-06, + "loss": 0.3395, + "step": 478 + }, + { + "epoch": 0.19, + "grad_norm": 5.431102669647372, + "learning_rate": 9.382377972539202e-06, + "loss": 0.3721, + "step": 479 + }, + { + "epoch": 0.19, + "grad_norm": 6.359552945387467, + "learning_rate": 9.379359168086231e-06, + "loss": 0.7253, + "step": 480 + }, + { + "epoch": 0.19, + "grad_norm": 5.68792187512187, + "learning_rate": 9.376333492075692e-06, + "loss": 0.7079, + "step": 481 + }, + { + "epoch": 0.19, + "grad_norm": 6.167717896122742, + "learning_rate": 9.373300949255112e-06, + "loss": 0.7042, + "step": 482 + }, + { + "epoch": 0.19, + "grad_norm": 6.831407302067921, + "learning_rate": 9.370261544382784e-06, + "loss": 0.5675, + "step": 483 + }, + { + "epoch": 0.19, + "grad_norm": 7.095153001002084, + "learning_rate": 9.367215282227775e-06, + "loss": 0.3116, + "step": 484 + }, + { + "epoch": 0.19, + "grad_norm": 4.897250530963122, + "learning_rate": 9.364162167569907e-06, + "loss": 0.4143, + "step": 485 + }, + { + "epoch": 0.19, + "grad_norm": 5.53624010008374, + "learning_rate": 9.361102205199762e-06, + "loss": 0.4532, + "step": 486 + }, + { + "epoch": 0.19, + "grad_norm": 5.5500232887782, + "learning_rate": 9.358035399918652e-06, + "loss": 0.3851, + "step": 487 + }, + { + "epoch": 0.19, + "grad_norm": 6.024281018095578, + "learning_rate": 9.35496175653864e-06, + "loss": 0.7731, + "step": 488 + }, + { + "epoch": 0.19, + "grad_norm": 6.298614201489692, + "learning_rate": 9.351881279882512e-06, + "loss": 0.4516, + "step": 489 + }, + { + "epoch": 0.19, + "grad_norm": 5.102037939841859, + "learning_rate": 9.348793974783778e-06, + "loss": 0.5464, + "step": 490 + }, + { + "epoch": 0.19, + "grad_norm": 4.810046538133617, + "learning_rate": 9.34569984608666e-06, + "loss": 0.5021, + "step": 491 + }, + { + "epoch": 0.19, + "grad_norm": 4.09677561170902, + "learning_rate": 9.34259889864609e-06, + "loss": 0.3154, + "step": 492 + }, + { + "epoch": 0.19, + "grad_norm": 5.462968183574109, + "learning_rate": 9.339491137327696e-06, + "loss": 0.3365, + "step": 493 + }, + { + "epoch": 0.19, + "grad_norm": 5.076862513783476, + "learning_rate": 9.336376567007799e-06, + "loss": 0.2389, + "step": 494 + }, + { + "epoch": 0.19, + "grad_norm": 5.472436045042487, + "learning_rate": 9.333255192573404e-06, + "loss": 0.4488, + "step": 495 + }, + { + "epoch": 0.19, + "grad_norm": 6.458578628390376, + "learning_rate": 9.330127018922195e-06, + "loss": 0.5609, + "step": 496 + }, + { + "epoch": 0.19, + "grad_norm": 5.035193804293063, + "learning_rate": 9.326992050962515e-06, + "loss": 0.4033, + "step": 497 + }, + { + "epoch": 0.19, + "grad_norm": 4.278576512322674, + "learning_rate": 9.32385029361338e-06, + "loss": 0.4599, + "step": 498 + }, + { + "epoch": 0.19, + "grad_norm": 5.9099550443679, + "learning_rate": 9.320701751804451e-06, + "loss": 0.2679, + "step": 499 + }, + { + "epoch": 0.19, + "grad_norm": 6.160666606684506, + "learning_rate": 9.317546430476036e-06, + "loss": 0.345, + "step": 500 + }, + { + "epoch": 0.19, + "grad_norm": 6.390224524129515, + "learning_rate": 9.314384334579085e-06, + "loss": 0.7871, + "step": 501 + }, + { + "epoch": 0.19, + "grad_norm": 5.169113614638828, + "learning_rate": 9.311215469075168e-06, + "loss": 0.5507, + "step": 502 + }, + { + "epoch": 0.19, + "grad_norm": 4.505867716803198, + "learning_rate": 9.30803983893649e-06, + "loss": 0.4485, + "step": 503 + }, + { + "epoch": 0.19, + "grad_norm": 4.997989343457168, + "learning_rate": 9.304857449145858e-06, + "loss": 0.224, + "step": 504 + }, + { + "epoch": 0.2, + "grad_norm": 5.9889268179994275, + "learning_rate": 9.301668304696694e-06, + "loss": 0.4886, + "step": 505 + }, + { + "epoch": 0.2, + "grad_norm": 8.650010453484125, + "learning_rate": 9.298472410593013e-06, + "loss": 0.2831, + "step": 506 + }, + { + "epoch": 0.2, + "grad_norm": 6.519307312442743, + "learning_rate": 9.295269771849426e-06, + "loss": 0.4835, + "step": 507 + }, + { + "epoch": 0.2, + "grad_norm": 5.999356502951759, + "learning_rate": 9.292060393491122e-06, + "loss": 0.4484, + "step": 508 + }, + { + "epoch": 0.2, + "grad_norm": 5.313726471258957, + "learning_rate": 9.288844280553869e-06, + "loss": 0.1948, + "step": 509 + }, + { + "epoch": 0.2, + "grad_norm": 6.714006463761147, + "learning_rate": 9.285621438083997e-06, + "loss": 0.5978, + "step": 510 + }, + { + "epoch": 0.2, + "grad_norm": 5.379507615700409, + "learning_rate": 9.282391871138404e-06, + "loss": 0.4984, + "step": 511 + }, + { + "epoch": 0.2, + "grad_norm": 4.658570973017829, + "learning_rate": 9.279155584784524e-06, + "loss": 0.4057, + "step": 512 + }, + { + "epoch": 0.2, + "grad_norm": 6.500958756680587, + "learning_rate": 9.275912584100353e-06, + "loss": 0.6567, + "step": 513 + }, + { + "epoch": 0.2, + "grad_norm": 4.310159285161232, + "learning_rate": 9.27266287417441e-06, + "loss": 0.3668, + "step": 514 + }, + { + "epoch": 0.2, + "grad_norm": 4.831937675268848, + "learning_rate": 9.269406460105742e-06, + "loss": 0.5193, + "step": 515 + }, + { + "epoch": 0.2, + "grad_norm": 7.099515238410736, + "learning_rate": 9.26614334700392e-06, + "loss": 0.8194, + "step": 516 + }, + { + "epoch": 0.2, + "grad_norm": 5.3368859479351745, + "learning_rate": 9.262873539989023e-06, + "loss": 0.5705, + "step": 517 + }, + { + "epoch": 0.2, + "grad_norm": 4.284847397211024, + "learning_rate": 9.259597044191635e-06, + "loss": 0.2042, + "step": 518 + }, + { + "epoch": 0.2, + "grad_norm": 6.735916668945992, + "learning_rate": 9.256313864752838e-06, + "loss": 0.4556, + "step": 519 + }, + { + "epoch": 0.2, + "grad_norm": 6.0754002428268885, + "learning_rate": 9.25302400682419e-06, + "loss": 0.6233, + "step": 520 + }, + { + "epoch": 0.2, + "grad_norm": 3.856899048001975, + "learning_rate": 9.249727475567742e-06, + "loss": 0.1953, + "step": 521 + }, + { + "epoch": 0.2, + "grad_norm": 5.643048611312609, + "learning_rate": 9.246424276156008e-06, + "loss": 0.3284, + "step": 522 + }, + { + "epoch": 0.2, + "grad_norm": 6.959454034089729, + "learning_rate": 9.243114413771966e-06, + "loss": 0.4909, + "step": 523 + }, + { + "epoch": 0.2, + "grad_norm": 5.911583548587121, + "learning_rate": 9.23979789360905e-06, + "loss": 0.552, + "step": 524 + }, + { + "epoch": 0.2, + "grad_norm": 6.161282668470586, + "learning_rate": 9.23647472087114e-06, + "loss": 0.7181, + "step": 525 + }, + { + "epoch": 0.2, + "grad_norm": 5.465134368035077, + "learning_rate": 9.233144900772553e-06, + "loss": 0.5358, + "step": 526 + }, + { + "epoch": 0.2, + "grad_norm": 5.286736045944329, + "learning_rate": 9.22980843853804e-06, + "loss": 0.4559, + "step": 527 + }, + { + "epoch": 0.2, + "grad_norm": 6.364453867411805, + "learning_rate": 9.226465339402768e-06, + "loss": 0.5925, + "step": 528 + }, + { + "epoch": 0.2, + "grad_norm": 4.097684020659741, + "learning_rate": 9.223115608612325e-06, + "loss": 0.2044, + "step": 529 + }, + { + "epoch": 0.2, + "grad_norm": 11.097457814626292, + "learning_rate": 9.2197592514227e-06, + "loss": 0.5472, + "step": 530 + }, + { + "epoch": 0.21, + "grad_norm": 6.004579623578877, + "learning_rate": 9.21639627310028e-06, + "loss": 0.6041, + "step": 531 + }, + { + "epoch": 0.21, + "grad_norm": 4.450798269783079, + "learning_rate": 9.21302667892184e-06, + "loss": 0.2549, + "step": 532 + }, + { + "epoch": 0.21, + "grad_norm": 4.67869819466794, + "learning_rate": 9.209650474174539e-06, + "loss": 0.4604, + "step": 533 + }, + { + "epoch": 0.21, + "grad_norm": 5.801778106016594, + "learning_rate": 9.206267664155906e-06, + "loss": 0.316, + "step": 534 + }, + { + "epoch": 0.21, + "grad_norm": 7.933585701980946, + "learning_rate": 9.202878254173836e-06, + "loss": 0.531, + "step": 535 + }, + { + "epoch": 0.21, + "grad_norm": 4.78864179174119, + "learning_rate": 9.199482249546577e-06, + "loss": 0.2779, + "step": 536 + }, + { + "epoch": 0.21, + "grad_norm": 5.615827418976137, + "learning_rate": 9.196079655602728e-06, + "loss": 0.2057, + "step": 537 + }, + { + "epoch": 0.21, + "grad_norm": 4.815998281078128, + "learning_rate": 9.192670477681224e-06, + "loss": 0.5645, + "step": 538 + }, + { + "epoch": 0.21, + "grad_norm": 6.335007672245006, + "learning_rate": 9.189254721131333e-06, + "loss": 0.7056, + "step": 539 + }, + { + "epoch": 0.21, + "grad_norm": 4.796413724050597, + "learning_rate": 9.185832391312644e-06, + "loss": 0.2466, + "step": 540 + }, + { + "epoch": 0.21, + "grad_norm": 6.5791002306653334, + "learning_rate": 9.18240349359506e-06, + "loss": 0.2612, + "step": 541 + }, + { + "epoch": 0.21, + "grad_norm": 8.328801436376093, + "learning_rate": 9.178968033358792e-06, + "loss": 0.6004, + "step": 542 + }, + { + "epoch": 0.21, + "grad_norm": 4.747678480571594, + "learning_rate": 9.175526015994345e-06, + "loss": 0.2985, + "step": 543 + }, + { + "epoch": 0.21, + "grad_norm": 4.291349934659031, + "learning_rate": 9.172077446902515e-06, + "loss": 0.3563, + "step": 544 + }, + { + "epoch": 0.21, + "grad_norm": 5.699165337729385, + "learning_rate": 9.168622331494375e-06, + "loss": 0.6731, + "step": 545 + }, + { + "epoch": 0.21, + "grad_norm": 4.760838927131119, + "learning_rate": 9.165160675191272e-06, + "loss": 0.3648, + "step": 546 + }, + { + "epoch": 0.21, + "grad_norm": 4.886185767469909, + "learning_rate": 9.161692483424817e-06, + "loss": 0.3649, + "step": 547 + }, + { + "epoch": 0.21, + "grad_norm": 4.240767179044931, + "learning_rate": 9.158217761636876e-06, + "loss": 0.3731, + "step": 548 + }, + { + "epoch": 0.21, + "grad_norm": 7.291531961236362, + "learning_rate": 9.154736515279557e-06, + "loss": 0.4663, + "step": 549 + }, + { + "epoch": 0.21, + "grad_norm": 5.2866096817589865, + "learning_rate": 9.151248749815208e-06, + "loss": 0.2821, + "step": 550 + }, + { + "epoch": 0.21, + "grad_norm": 5.013726750760507, + "learning_rate": 9.147754470716407e-06, + "loss": 0.3941, + "step": 551 + }, + { + "epoch": 0.21, + "grad_norm": 4.876090507590155, + "learning_rate": 9.144253683465953e-06, + "loss": 0.4349, + "step": 552 + }, + { + "epoch": 0.21, + "grad_norm": 6.415423062861582, + "learning_rate": 9.140746393556853e-06, + "loss": 0.4424, + "step": 553 + }, + { + "epoch": 0.21, + "grad_norm": 4.881403743928506, + "learning_rate": 9.137232606492323e-06, + "loss": 0.2772, + "step": 554 + }, + { + "epoch": 0.21, + "grad_norm": 4.761249797477214, + "learning_rate": 9.133712327785769e-06, + "loss": 0.2298, + "step": 555 + }, + { + "epoch": 0.21, + "grad_norm": 6.054664391701198, + "learning_rate": 9.13018556296078e-06, + "loss": 0.4511, + "step": 556 + }, + { + "epoch": 0.22, + "grad_norm": 4.502707759038403, + "learning_rate": 9.12665231755113e-06, + "loss": 0.4485, + "step": 557 + }, + { + "epoch": 0.22, + "grad_norm": 5.85306690787738, + "learning_rate": 9.123112597100759e-06, + "loss": 0.6611, + "step": 558 + }, + { + "epoch": 0.22, + "grad_norm": 5.943386447646357, + "learning_rate": 9.11956640716376e-06, + "loss": 0.6677, + "step": 559 + }, + { + "epoch": 0.22, + "grad_norm": 4.947155732146871, + "learning_rate": 9.11601375330439e-06, + "loss": 0.4153, + "step": 560 + }, + { + "epoch": 0.22, + "grad_norm": 4.823155051539645, + "learning_rate": 9.112454641097037e-06, + "loss": 0.5055, + "step": 561 + }, + { + "epoch": 0.22, + "grad_norm": 4.88174364914201, + "learning_rate": 9.108889076126226e-06, + "loss": 0.3548, + "step": 562 + }, + { + "epoch": 0.22, + "grad_norm": 5.219007935618746, + "learning_rate": 9.105317063986613e-06, + "loss": 0.4683, + "step": 563 + }, + { + "epoch": 0.22, + "grad_norm": 4.081347804242448, + "learning_rate": 9.101738610282956e-06, + "loss": 0.479, + "step": 564 + }, + { + "epoch": 0.22, + "grad_norm": 5.720049851954457, + "learning_rate": 9.098153720630138e-06, + "loss": 0.3496, + "step": 565 + }, + { + "epoch": 0.22, + "grad_norm": 4.302519555948863, + "learning_rate": 9.094562400653127e-06, + "loss": 0.4195, + "step": 566 + }, + { + "epoch": 0.22, + "grad_norm": 4.908405903770226, + "learning_rate": 9.090964655986985e-06, + "loss": 0.398, + "step": 567 + }, + { + "epoch": 0.22, + "grad_norm": 5.689078199906083, + "learning_rate": 9.087360492276858e-06, + "loss": 0.5135, + "step": 568 + }, + { + "epoch": 0.22, + "grad_norm": 5.765767677497044, + "learning_rate": 9.083749915177959e-06, + "loss": 0.4673, + "step": 569 + }, + { + "epoch": 0.22, + "grad_norm": 5.98511312955719, + "learning_rate": 9.080132930355567e-06, + "loss": 0.8216, + "step": 570 + }, + { + "epoch": 0.22, + "grad_norm": 5.8405831251850655, + "learning_rate": 9.076509543485014e-06, + "loss": 0.5163, + "step": 571 + }, + { + "epoch": 0.22, + "grad_norm": 5.3180793693619846, + "learning_rate": 9.07287976025168e-06, + "loss": 0.3032, + "step": 572 + }, + { + "epoch": 0.22, + "grad_norm": 4.223416951598585, + "learning_rate": 9.069243586350976e-06, + "loss": 0.2761, + "step": 573 + }, + { + "epoch": 0.22, + "grad_norm": 6.461059520813073, + "learning_rate": 9.065601027488345e-06, + "loss": 0.414, + "step": 574 + }, + { + "epoch": 0.22, + "grad_norm": 4.988423555806269, + "learning_rate": 9.061952089379248e-06, + "loss": 0.2759, + "step": 575 + }, + { + "epoch": 0.22, + "grad_norm": 4.741573008092071, + "learning_rate": 9.058296777749154e-06, + "loss": 0.2549, + "step": 576 + }, + { + "epoch": 0.22, + "grad_norm": 7.085477411513353, + "learning_rate": 9.054635098333532e-06, + "loss": 0.309, + "step": 577 + }, + { + "epoch": 0.22, + "grad_norm": 6.374244270142311, + "learning_rate": 9.050967056877846e-06, + "loss": 0.4485, + "step": 578 + }, + { + "epoch": 0.22, + "grad_norm": 6.1420596895146895, + "learning_rate": 9.047292659137542e-06, + "loss": 0.399, + "step": 579 + }, + { + "epoch": 0.22, + "grad_norm": 4.39580420579575, + "learning_rate": 9.043611910878031e-06, + "loss": 0.2966, + "step": 580 + }, + { + "epoch": 0.22, + "grad_norm": 5.542793329823031, + "learning_rate": 9.0399248178747e-06, + "loss": 0.2776, + "step": 581 + }, + { + "epoch": 0.23, + "grad_norm": 5.7841474890677125, + "learning_rate": 9.03623138591289e-06, + "loss": 0.1636, + "step": 582 + }, + { + "epoch": 0.23, + "grad_norm": 5.106091244850837, + "learning_rate": 9.032531620787879e-06, + "loss": 0.597, + "step": 583 + }, + { + "epoch": 0.23, + "grad_norm": 4.9611394452724635, + "learning_rate": 9.028825528304892e-06, + "loss": 0.2254, + "step": 584 + }, + { + "epoch": 0.23, + "grad_norm": 4.799929143801151, + "learning_rate": 9.025113114279076e-06, + "loss": 0.5035, + "step": 585 + }, + { + "epoch": 0.23, + "grad_norm": 3.9930514568363513, + "learning_rate": 9.0213943845355e-06, + "loss": 0.3169, + "step": 586 + }, + { + "epoch": 0.23, + "grad_norm": 6.394127381774008, + "learning_rate": 9.017669344909143e-06, + "loss": 0.3502, + "step": 587 + }, + { + "epoch": 0.23, + "grad_norm": 4.6361982202751015, + "learning_rate": 9.013938001244885e-06, + "loss": 0.1503, + "step": 588 + }, + { + "epoch": 0.23, + "grad_norm": 5.883508947746172, + "learning_rate": 9.010200359397495e-06, + "loss": 0.733, + "step": 589 + }, + { + "epoch": 0.23, + "grad_norm": 4.2364534272751335, + "learning_rate": 9.006456425231624e-06, + "loss": 0.5015, + "step": 590 + }, + { + "epoch": 0.23, + "grad_norm": 4.147546378113793, + "learning_rate": 9.002706204621802e-06, + "loss": 0.1173, + "step": 591 + }, + { + "epoch": 0.23, + "grad_norm": 6.456674295259403, + "learning_rate": 8.998949703452418e-06, + "loss": 0.6133, + "step": 592 + }, + { + "epoch": 0.23, + "grad_norm": 4.7048223122125234, + "learning_rate": 8.995186927617714e-06, + "loss": 0.5821, + "step": 593 + }, + { + "epoch": 0.23, + "grad_norm": 4.221477340291791, + "learning_rate": 8.99141788302178e-06, + "loss": 0.5367, + "step": 594 + }, + { + "epoch": 0.23, + "grad_norm": 7.853876499204625, + "learning_rate": 8.987642575578546e-06, + "loss": 0.2262, + "step": 595 + }, + { + "epoch": 0.23, + "grad_norm": 4.851521493110232, + "learning_rate": 8.98386101121176e-06, + "loss": 0.3634, + "step": 596 + }, + { + "epoch": 0.23, + "grad_norm": 4.793360927811436, + "learning_rate": 8.980073195854999e-06, + "loss": 0.3085, + "step": 597 + }, + { + "epoch": 0.23, + "grad_norm": 5.684086790765379, + "learning_rate": 8.976279135451636e-06, + "loss": 0.4492, + "step": 598 + }, + { + "epoch": 0.23, + "grad_norm": 5.406048209770403, + "learning_rate": 8.972478835954851e-06, + "loss": 0.4793, + "step": 599 + }, + { + "epoch": 0.23, + "grad_norm": 7.532470906160693, + "learning_rate": 8.968672303327614e-06, + "loss": 0.3609, + "step": 600 + }, + { + "epoch": 0.23, + "grad_norm": 5.321851260805017, + "learning_rate": 8.96485954354267e-06, + "loss": 0.4429, + "step": 601 + }, + { + "epoch": 0.23, + "grad_norm": 5.585618498670008, + "learning_rate": 8.96104056258254e-06, + "loss": 0.6076, + "step": 602 + }, + { + "epoch": 0.23, + "grad_norm": 4.350156432156571, + "learning_rate": 8.9572153664395e-06, + "loss": 0.3776, + "step": 603 + }, + { + "epoch": 0.23, + "grad_norm": 4.554467816095734, + "learning_rate": 8.953383961115586e-06, + "loss": 0.3746, + "step": 604 + }, + { + "epoch": 0.23, + "grad_norm": 3.778674710491944, + "learning_rate": 8.949546352622573e-06, + "loss": 0.3309, + "step": 605 + }, + { + "epoch": 0.23, + "grad_norm": 3.875920570672478, + "learning_rate": 8.94570254698197e-06, + "loss": 0.5326, + "step": 606 + }, + { + "epoch": 0.23, + "grad_norm": 5.69369852448707, + "learning_rate": 8.941852550225005e-06, + "loss": 0.4415, + "step": 607 + }, + { + "epoch": 0.24, + "grad_norm": 3.720349321793076, + "learning_rate": 8.937996368392629e-06, + "loss": 0.1353, + "step": 608 + }, + { + "epoch": 0.24, + "grad_norm": 7.1578710796940594, + "learning_rate": 8.93413400753549e-06, + "loss": 0.4838, + "step": 609 + }, + { + "epoch": 0.24, + "grad_norm": 4.3784366525516445, + "learning_rate": 8.930265473713939e-06, + "loss": 0.4246, + "step": 610 + }, + { + "epoch": 0.24, + "grad_norm": 5.811956853801444, + "learning_rate": 8.926390772998004e-06, + "loss": 0.332, + "step": 611 + }, + { + "epoch": 0.24, + "grad_norm": 5.116776780546333, + "learning_rate": 8.922509911467395e-06, + "loss": 0.503, + "step": 612 + }, + { + "epoch": 0.24, + "grad_norm": 7.233705981168302, + "learning_rate": 8.91862289521149e-06, + "loss": 0.5296, + "step": 613 + }, + { + "epoch": 0.24, + "grad_norm": 7.83131844562604, + "learning_rate": 8.914729730329321e-06, + "loss": 0.5964, + "step": 614 + }, + { + "epoch": 0.24, + "grad_norm": 3.5487472129396322, + "learning_rate": 8.910830422929566e-06, + "loss": 0.1257, + "step": 615 + }, + { + "epoch": 0.24, + "grad_norm": 4.976462171343212, + "learning_rate": 8.906924979130548e-06, + "loss": 0.2292, + "step": 616 + }, + { + "epoch": 0.24, + "grad_norm": 6.125082454484557, + "learning_rate": 8.903013405060212e-06, + "loss": 0.2502, + "step": 617 + }, + { + "epoch": 0.24, + "grad_norm": 5.2103922468222414, + "learning_rate": 8.899095706856122e-06, + "loss": 0.6488, + "step": 618 + }, + { + "epoch": 0.24, + "grad_norm": 5.563943064276356, + "learning_rate": 8.895171890665457e-06, + "loss": 0.3041, + "step": 619 + }, + { + "epoch": 0.24, + "grad_norm": 4.90549608756185, + "learning_rate": 8.891241962644992e-06, + "loss": 0.3908, + "step": 620 + }, + { + "epoch": 0.24, + "grad_norm": 4.5647758747705645, + "learning_rate": 8.887305928961088e-06, + "loss": 0.2606, + "step": 621 + }, + { + "epoch": 0.24, + "grad_norm": 4.338540588701125, + "learning_rate": 8.883363795789694e-06, + "loss": 0.4628, + "step": 622 + }, + { + "epoch": 0.24, + "grad_norm": 4.989041661991694, + "learning_rate": 8.879415569316324e-06, + "loss": 0.3614, + "step": 623 + }, + { + "epoch": 0.24, + "grad_norm": 5.72503531576978, + "learning_rate": 8.875461255736055e-06, + "loss": 0.6349, + "step": 624 + }, + { + "epoch": 0.24, + "grad_norm": 5.99429370355517, + "learning_rate": 8.871500861253515e-06, + "loss": 0.6988, + "step": 625 + }, + { + "epoch": 0.24, + "grad_norm": 6.741269406283492, + "learning_rate": 8.867534392082873e-06, + "loss": 0.3059, + "step": 626 + }, + { + "epoch": 0.24, + "grad_norm": 7.082198357748838, + "learning_rate": 8.863561854447829e-06, + "loss": 0.3409, + "step": 627 + }, + { + "epoch": 0.24, + "grad_norm": 4.886963998552751, + "learning_rate": 8.859583254581604e-06, + "loss": 0.2079, + "step": 628 + }, + { + "epoch": 0.24, + "grad_norm": 5.777052226164107, + "learning_rate": 8.85559859872694e-06, + "loss": 0.6578, + "step": 629 + }, + { + "epoch": 0.24, + "grad_norm": 7.477167421114731, + "learning_rate": 8.851607893136065e-06, + "loss": 0.691, + "step": 630 + }, + { + "epoch": 0.24, + "grad_norm": 8.719875740447138, + "learning_rate": 8.847611144070716e-06, + "loss": 0.6848, + "step": 631 + }, + { + "epoch": 0.24, + "grad_norm": 6.034115251265882, + "learning_rate": 8.8436083578021e-06, + "loss": 0.4325, + "step": 632 + }, + { + "epoch": 0.24, + "grad_norm": 6.372608770075029, + "learning_rate": 8.839599540610906e-06, + "loss": 0.3621, + "step": 633 + }, + { + "epoch": 0.25, + "grad_norm": 5.232505799557821, + "learning_rate": 8.83558469878728e-06, + "loss": 0.2291, + "step": 634 + }, + { + "epoch": 0.25, + "grad_norm": 6.288205315432176, + "learning_rate": 8.831563838630825e-06, + "loss": 0.5354, + "step": 635 + }, + { + "epoch": 0.25, + "grad_norm": 5.384007929077674, + "learning_rate": 8.827536966450584e-06, + "loss": 0.5507, + "step": 636 + }, + { + "epoch": 0.25, + "grad_norm": 6.510150381590252, + "learning_rate": 8.823504088565035e-06, + "loss": 0.2524, + "step": 637 + }, + { + "epoch": 0.25, + "grad_norm": 11.303322110036294, + "learning_rate": 8.819465211302081e-06, + "loss": 0.674, + "step": 638 + }, + { + "epoch": 0.25, + "grad_norm": 5.999983164193584, + "learning_rate": 8.815420340999034e-06, + "loss": 0.2924, + "step": 639 + }, + { + "epoch": 0.25, + "grad_norm": 6.12093234982364, + "learning_rate": 8.811369484002614e-06, + "loss": 0.3968, + "step": 640 + }, + { + "epoch": 0.25, + "grad_norm": 6.619460408812439, + "learning_rate": 8.807312646668932e-06, + "loss": 0.4993, + "step": 641 + }, + { + "epoch": 0.25, + "grad_norm": 5.934053692341061, + "learning_rate": 8.803249835363486e-06, + "loss": 0.2572, + "step": 642 + }, + { + "epoch": 0.25, + "grad_norm": 6.30826448851593, + "learning_rate": 8.799181056461143e-06, + "loss": 0.4038, + "step": 643 + }, + { + "epoch": 0.25, + "grad_norm": 4.47423916678203, + "learning_rate": 8.795106316346136e-06, + "loss": 0.4849, + "step": 644 + }, + { + "epoch": 0.25, + "grad_norm": 4.466568199114376, + "learning_rate": 8.791025621412052e-06, + "loss": 0.332, + "step": 645 + }, + { + "epoch": 0.25, + "grad_norm": 5.243633106944956, + "learning_rate": 8.78693897806182e-06, + "loss": 0.3374, + "step": 646 + }, + { + "epoch": 0.25, + "grad_norm": 5.424150262851797, + "learning_rate": 8.782846392707704e-06, + "loss": 0.3594, + "step": 647 + }, + { + "epoch": 0.25, + "grad_norm": 4.45245317318313, + "learning_rate": 8.778747871771293e-06, + "loss": 0.2824, + "step": 648 + }, + { + "epoch": 0.25, + "grad_norm": 7.324510289911242, + "learning_rate": 8.774643421683484e-06, + "loss": 0.3548, + "step": 649 + }, + { + "epoch": 0.25, + "grad_norm": 4.708580457516981, + "learning_rate": 8.770533048884483e-06, + "loss": 0.326, + "step": 650 + }, + { + "epoch": 0.25, + "grad_norm": 5.074061664096671, + "learning_rate": 8.766416759823785e-06, + "loss": 0.3699, + "step": 651 + }, + { + "epoch": 0.25, + "grad_norm": 3.7163699528315157, + "learning_rate": 8.762294560960173e-06, + "loss": 0.1294, + "step": 652 + }, + { + "epoch": 0.25, + "grad_norm": 8.540201543268772, + "learning_rate": 8.758166458761695e-06, + "loss": 0.3724, + "step": 653 + }, + { + "epoch": 0.25, + "grad_norm": 6.112779353599681, + "learning_rate": 8.754032459705672e-06, + "loss": 0.2772, + "step": 654 + }, + { + "epoch": 0.25, + "grad_norm": 4.368645623986805, + "learning_rate": 8.74989257027867e-06, + "loss": 0.2362, + "step": 655 + }, + { + "epoch": 0.25, + "grad_norm": 4.123496368634227, + "learning_rate": 8.7457467969765e-06, + "loss": 0.2468, + "step": 656 + }, + { + "epoch": 0.25, + "grad_norm": 5.020379051578081, + "learning_rate": 8.741595146304202e-06, + "loss": 0.3748, + "step": 657 + }, + { + "epoch": 0.25, + "grad_norm": 3.2442462447122233, + "learning_rate": 8.737437624776047e-06, + "loss": 0.1612, + "step": 658 + }, + { + "epoch": 0.25, + "grad_norm": 4.990264060435372, + "learning_rate": 8.733274238915508e-06, + "loss": 0.4543, + "step": 659 + }, + { + "epoch": 0.26, + "grad_norm": 4.249144903142453, + "learning_rate": 8.729104995255265e-06, + "loss": 0.1474, + "step": 660 + }, + { + "epoch": 0.26, + "grad_norm": 4.613450551881159, + "learning_rate": 8.724929900337186e-06, + "loss": 0.3388, + "step": 661 + }, + { + "epoch": 0.26, + "grad_norm": 5.822050202241509, + "learning_rate": 8.720748960712323e-06, + "loss": 0.6221, + "step": 662 + }, + { + "epoch": 0.26, + "grad_norm": 4.740105180638977, + "learning_rate": 8.7165621829409e-06, + "loss": 0.2057, + "step": 663 + }, + { + "epoch": 0.26, + "grad_norm": 4.9460178659084795, + "learning_rate": 8.712369573592296e-06, + "loss": 0.3938, + "step": 664 + }, + { + "epoch": 0.26, + "grad_norm": 4.31040494345963, + "learning_rate": 8.708171139245045e-06, + "loss": 0.2213, + "step": 665 + }, + { + "epoch": 0.26, + "grad_norm": 5.788913480644609, + "learning_rate": 8.703966886486819e-06, + "loss": 0.3471, + "step": 666 + }, + { + "epoch": 0.26, + "grad_norm": 5.545026373357742, + "learning_rate": 8.69975682191442e-06, + "loss": 0.6506, + "step": 667 + }, + { + "epoch": 0.26, + "grad_norm": 5.344671011113987, + "learning_rate": 8.695540952133769e-06, + "loss": 0.3016, + "step": 668 + }, + { + "epoch": 0.26, + "grad_norm": 4.303114281176954, + "learning_rate": 8.691319283759896e-06, + "loss": 0.5347, + "step": 669 + }, + { + "epoch": 0.26, + "grad_norm": 4.723666597161375, + "learning_rate": 8.68709182341693e-06, + "loss": 0.5101, + "step": 670 + }, + { + "epoch": 0.26, + "grad_norm": 5.085242528833489, + "learning_rate": 8.682858577738086e-06, + "loss": 0.3768, + "step": 671 + }, + { + "epoch": 0.26, + "grad_norm": 3.779620333726095, + "learning_rate": 8.67861955336566e-06, + "loss": 0.5043, + "step": 672 + }, + { + "epoch": 0.26, + "grad_norm": 5.687776970224553, + "learning_rate": 8.67437475695101e-06, + "loss": 0.4646, + "step": 673 + }, + { + "epoch": 0.26, + "grad_norm": 5.345313862016524, + "learning_rate": 8.670124195154557e-06, + "loss": 0.2227, + "step": 674 + }, + { + "epoch": 0.26, + "grad_norm": 6.169312047845613, + "learning_rate": 8.665867874645767e-06, + "loss": 0.3778, + "step": 675 + }, + { + "epoch": 0.26, + "grad_norm": 4.425793391537271, + "learning_rate": 8.661605802103134e-06, + "loss": 0.7285, + "step": 676 + }, + { + "epoch": 0.26, + "grad_norm": 5.659876500909084, + "learning_rate": 8.657337984214189e-06, + "loss": 0.2796, + "step": 677 + }, + { + "epoch": 0.26, + "grad_norm": 4.009208184545386, + "learning_rate": 8.65306442767547e-06, + "loss": 0.1256, + "step": 678 + }, + { + "epoch": 0.26, + "grad_norm": 5.0990976733828735, + "learning_rate": 8.648785139192526e-06, + "loss": 0.4689, + "step": 679 + }, + { + "epoch": 0.26, + "grad_norm": 6.224789766419377, + "learning_rate": 8.644500125479891e-06, + "loss": 0.5291, + "step": 680 + }, + { + "epoch": 0.26, + "grad_norm": 5.694503032236168, + "learning_rate": 8.640209393261087e-06, + "loss": 0.4087, + "step": 681 + }, + { + "epoch": 0.26, + "grad_norm": 6.260352448355556, + "learning_rate": 8.635912949268614e-06, + "loss": 0.5022, + "step": 682 + }, + { + "epoch": 0.26, + "grad_norm": 5.0785051869218645, + "learning_rate": 8.631610800243926e-06, + "loss": 0.5318, + "step": 683 + }, + { + "epoch": 0.26, + "grad_norm": 6.480147091940225, + "learning_rate": 8.627302952937431e-06, + "loss": 0.3168, + "step": 684 + }, + { + "epoch": 0.26, + "grad_norm": 5.3855043647303855, + "learning_rate": 8.622989414108479e-06, + "loss": 0.4281, + "step": 685 + }, + { + "epoch": 0.27, + "grad_norm": 5.772660443031573, + "learning_rate": 8.61867019052535e-06, + "loss": 0.3886, + "step": 686 + }, + { + "epoch": 0.27, + "grad_norm": 4.552568074182524, + "learning_rate": 8.614345288965249e-06, + "loss": 0.6878, + "step": 687 + }, + { + "epoch": 0.27, + "grad_norm": 6.074780148567725, + "learning_rate": 8.61001471621428e-06, + "loss": 0.381, + "step": 688 + }, + { + "epoch": 0.27, + "grad_norm": 7.885979991047838, + "learning_rate": 8.60567847906745e-06, + "loss": 0.1815, + "step": 689 + }, + { + "epoch": 0.27, + "grad_norm": 6.654217815521764, + "learning_rate": 8.601336584328659e-06, + "loss": 0.7389, + "step": 690 + }, + { + "epoch": 0.27, + "grad_norm": 5.137268516797368, + "learning_rate": 8.596989038810678e-06, + "loss": 0.572, + "step": 691 + }, + { + "epoch": 0.27, + "grad_norm": 8.729037352933048, + "learning_rate": 8.592635849335148e-06, + "loss": 0.4381, + "step": 692 + }, + { + "epoch": 0.27, + "grad_norm": 3.449555865086533, + "learning_rate": 8.58827702273256e-06, + "loss": 0.3368, + "step": 693 + }, + { + "epoch": 0.27, + "grad_norm": 4.562912791658195, + "learning_rate": 8.583912565842258e-06, + "loss": 0.5002, + "step": 694 + }, + { + "epoch": 0.27, + "grad_norm": 5.649673286196971, + "learning_rate": 8.579542485512416e-06, + "loss": 0.4945, + "step": 695 + }, + { + "epoch": 0.27, + "grad_norm": 5.18833215378799, + "learning_rate": 8.575166788600031e-06, + "loss": 0.4533, + "step": 696 + }, + { + "epoch": 0.27, + "grad_norm": 6.547724219921325, + "learning_rate": 8.570785481970915e-06, + "loss": 0.262, + "step": 697 + }, + { + "epoch": 0.27, + "grad_norm": 6.29980581561246, + "learning_rate": 8.566398572499685e-06, + "loss": 0.4902, + "step": 698 + }, + { + "epoch": 0.27, + "grad_norm": 4.632492170796724, + "learning_rate": 8.56200606706974e-06, + "loss": 0.4435, + "step": 699 + }, + { + "epoch": 0.27, + "grad_norm": 5.737027857054075, + "learning_rate": 8.557607972573267e-06, + "loss": 0.4521, + "step": 700 + }, + { + "epoch": 0.27, + "grad_norm": 5.080692048952784, + "learning_rate": 8.553204295911222e-06, + "loss": 0.4275, + "step": 701 + }, + { + "epoch": 0.27, + "grad_norm": 4.3992850488391335, + "learning_rate": 8.548795043993316e-06, + "loss": 0.1881, + "step": 702 + }, + { + "epoch": 0.27, + "grad_norm": 5.39630771421547, + "learning_rate": 8.544380223738015e-06, + "loss": 0.4196, + "step": 703 + }, + { + "epoch": 0.27, + "grad_norm": 5.406526853578896, + "learning_rate": 8.539959842072513e-06, + "loss": 0.4037, + "step": 704 + }, + { + "epoch": 0.27, + "grad_norm": 4.71538778598701, + "learning_rate": 8.535533905932739e-06, + "loss": 0.522, + "step": 705 + }, + { + "epoch": 0.27, + "grad_norm": 5.148550809802702, + "learning_rate": 8.53110242226333e-06, + "loss": 0.2433, + "step": 706 + }, + { + "epoch": 0.27, + "grad_norm": 4.835256793033029, + "learning_rate": 8.526665398017633e-06, + "loss": 0.2493, + "step": 707 + }, + { + "epoch": 0.27, + "grad_norm": 4.771566456752833, + "learning_rate": 8.522222840157687e-06, + "loss": 0.462, + "step": 708 + }, + { + "epoch": 0.27, + "grad_norm": 4.4112013405040225, + "learning_rate": 8.517774755654212e-06, + "loss": 0.5426, + "step": 709 + }, + { + "epoch": 0.27, + "grad_norm": 6.476116214038771, + "learning_rate": 8.513321151486602e-06, + "loss": 0.3139, + "step": 710 + }, + { + "epoch": 0.27, + "grad_norm": 6.105784418844728, + "learning_rate": 8.508862034642909e-06, + "loss": 0.4809, + "step": 711 + }, + { + "epoch": 0.28, + "grad_norm": 4.956245030040778, + "learning_rate": 8.504397412119838e-06, + "loss": 0.4856, + "step": 712 + }, + { + "epoch": 0.28, + "grad_norm": 4.353146817162132, + "learning_rate": 8.499927290922734e-06, + "loss": 0.2731, + "step": 713 + }, + { + "epoch": 0.28, + "grad_norm": 4.387553795360139, + "learning_rate": 8.495451678065563e-06, + "loss": 0.3675, + "step": 714 + }, + { + "epoch": 0.28, + "grad_norm": 5.9716862870401, + "learning_rate": 8.490970580570912e-06, + "loss": 0.3146, + "step": 715 + }, + { + "epoch": 0.28, + "grad_norm": 4.465889783287375, + "learning_rate": 8.486484005469977e-06, + "loss": 0.3151, + "step": 716 + }, + { + "epoch": 0.28, + "grad_norm": 9.457237970712992, + "learning_rate": 8.481991959802546e-06, + "loss": 0.5893, + "step": 717 + }, + { + "epoch": 0.28, + "grad_norm": 5.08714400356136, + "learning_rate": 8.477494450616988e-06, + "loss": 0.317, + "step": 718 + }, + { + "epoch": 0.28, + "grad_norm": 6.453397786116955, + "learning_rate": 8.472991484970247e-06, + "loss": 0.3618, + "step": 719 + }, + { + "epoch": 0.28, + "grad_norm": 6.930592776162307, + "learning_rate": 8.468483069927832e-06, + "loss": 0.4635, + "step": 720 + }, + { + "epoch": 0.28, + "grad_norm": 4.92336479181062, + "learning_rate": 8.463969212563796e-06, + "loss": 0.3627, + "step": 721 + }, + { + "epoch": 0.28, + "grad_norm": 5.583733544579457, + "learning_rate": 8.459449919960737e-06, + "loss": 0.4629, + "step": 722 + }, + { + "epoch": 0.28, + "grad_norm": 5.239532845260946, + "learning_rate": 8.454925199209778e-06, + "loss": 0.4101, + "step": 723 + }, + { + "epoch": 0.28, + "grad_norm": 3.3101666129775786, + "learning_rate": 8.450395057410561e-06, + "loss": 0.2588, + "step": 724 + }, + { + "epoch": 0.28, + "grad_norm": 5.694791777010421, + "learning_rate": 8.445859501671232e-06, + "loss": 0.3593, + "step": 725 + }, + { + "epoch": 0.28, + "grad_norm": 4.294140648416576, + "learning_rate": 8.441318539108433e-06, + "loss": 0.1859, + "step": 726 + }, + { + "epoch": 0.28, + "grad_norm": 5.5982421619702745, + "learning_rate": 8.436772176847295e-06, + "loss": 0.5006, + "step": 727 + }, + { + "epoch": 0.28, + "grad_norm": 4.32551203191427, + "learning_rate": 8.432220422021408e-06, + "loss": 0.3533, + "step": 728 + }, + { + "epoch": 0.28, + "grad_norm": 4.7729572569482865, + "learning_rate": 8.42766328177284e-06, + "loss": 0.2034, + "step": 729 + }, + { + "epoch": 0.28, + "grad_norm": 4.60106757513709, + "learning_rate": 8.423100763252094e-06, + "loss": 0.2384, + "step": 730 + }, + { + "epoch": 0.28, + "grad_norm": 6.341121812752496, + "learning_rate": 8.418532873618125e-06, + "loss": 0.3294, + "step": 731 + }, + { + "epoch": 0.28, + "grad_norm": 4.390056947750818, + "learning_rate": 8.413959620038306e-06, + "loss": 0.4776, + "step": 732 + }, + { + "epoch": 0.28, + "grad_norm": 4.716304747185911, + "learning_rate": 8.409381009688431e-06, + "loss": 0.4473, + "step": 733 + }, + { + "epoch": 0.28, + "grad_norm": 6.077923171897502, + "learning_rate": 8.404797049752697e-06, + "loss": 0.525, + "step": 734 + }, + { + "epoch": 0.28, + "grad_norm": 4.90524035318987, + "learning_rate": 8.4002077474237e-06, + "loss": 0.3571, + "step": 735 + }, + { + "epoch": 0.28, + "grad_norm": 5.353718092364014, + "learning_rate": 8.39561310990241e-06, + "loss": 0.6481, + "step": 736 + }, + { + "epoch": 0.28, + "grad_norm": 6.2212785232108425, + "learning_rate": 8.39101314439818e-06, + "loss": 0.8407, + "step": 737 + }, + { + "epoch": 0.29, + "grad_norm": 4.632940916977337, + "learning_rate": 8.386407858128707e-06, + "loss": 0.3278, + "step": 738 + }, + { + "epoch": 0.29, + "grad_norm": 8.421823200834819, + "learning_rate": 8.38179725832005e-06, + "loss": 0.4727, + "step": 739 + }, + { + "epoch": 0.29, + "grad_norm": 4.251013698044102, + "learning_rate": 8.377181352206604e-06, + "loss": 0.4168, + "step": 740 + }, + { + "epoch": 0.29, + "grad_norm": 4.5407855751458355, + "learning_rate": 8.372560147031087e-06, + "loss": 0.4087, + "step": 741 + }, + { + "epoch": 0.29, + "grad_norm": 6.221369280283807, + "learning_rate": 8.367933650044526e-06, + "loss": 0.2892, + "step": 742 + }, + { + "epoch": 0.29, + "grad_norm": 5.308213092755246, + "learning_rate": 8.363301868506264e-06, + "loss": 0.2703, + "step": 743 + }, + { + "epoch": 0.29, + "grad_norm": 4.980934617217376, + "learning_rate": 8.358664809683926e-06, + "loss": 0.5673, + "step": 744 + }, + { + "epoch": 0.29, + "grad_norm": 6.1931510638054945, + "learning_rate": 8.354022480853418e-06, + "loss": 0.5605, + "step": 745 + }, + { + "epoch": 0.29, + "grad_norm": 4.117437936432802, + "learning_rate": 8.349374889298923e-06, + "loss": 0.1584, + "step": 746 + }, + { + "epoch": 0.29, + "grad_norm": 6.136085182252913, + "learning_rate": 8.344722042312872e-06, + "loss": 0.4435, + "step": 747 + }, + { + "epoch": 0.29, + "grad_norm": 5.550301029816411, + "learning_rate": 8.340063947195947e-06, + "loss": 0.2945, + "step": 748 + }, + { + "epoch": 0.29, + "grad_norm": 8.800424006474591, + "learning_rate": 8.335400611257067e-06, + "loss": 0.4362, + "step": 749 + }, + { + "epoch": 0.29, + "grad_norm": 9.160866290111288, + "learning_rate": 8.330732041813367e-06, + "loss": 0.5082, + "step": 750 + }, + { + "epoch": 0.29, + "grad_norm": 4.143961468455159, + "learning_rate": 8.326058246190202e-06, + "loss": 0.3298, + "step": 751 + }, + { + "epoch": 0.29, + "grad_norm": 4.360607497775467, + "learning_rate": 8.321379231721123e-06, + "loss": 0.336, + "step": 752 + }, + { + "epoch": 0.29, + "grad_norm": 4.934394227353047, + "learning_rate": 8.316695005747866e-06, + "loss": 0.5735, + "step": 753 + }, + { + "epoch": 0.29, + "grad_norm": 8.161434531278339, + "learning_rate": 8.312005575620355e-06, + "loss": 0.3328, + "step": 754 + }, + { + "epoch": 0.29, + "grad_norm": 4.371996663426846, + "learning_rate": 8.307310948696667e-06, + "loss": 0.426, + "step": 755 + }, + { + "epoch": 0.29, + "grad_norm": 4.347178597445798, + "learning_rate": 8.302611132343042e-06, + "loss": 0.3719, + "step": 756 + }, + { + "epoch": 0.29, + "grad_norm": 7.302882867714401, + "learning_rate": 8.297906133933861e-06, + "loss": 0.1944, + "step": 757 + }, + { + "epoch": 0.29, + "grad_norm": 7.77277910734547, + "learning_rate": 8.293195960851634e-06, + "loss": 0.5761, + "step": 758 + }, + { + "epoch": 0.29, + "grad_norm": 5.644844069972676, + "learning_rate": 8.288480620486991e-06, + "loss": 0.5338, + "step": 759 + }, + { + "epoch": 0.29, + "grad_norm": 3.6321830270740136, + "learning_rate": 8.283760120238672e-06, + "loss": 0.1059, + "step": 760 + }, + { + "epoch": 0.29, + "grad_norm": 4.461153624018655, + "learning_rate": 8.27903446751351e-06, + "loss": 0.1658, + "step": 761 + }, + { + "epoch": 0.29, + "grad_norm": 11.673090789730939, + "learning_rate": 8.274303669726427e-06, + "loss": 0.3966, + "step": 762 + }, + { + "epoch": 0.29, + "grad_norm": 3.756068482704599, + "learning_rate": 8.26956773430041e-06, + "loss": 0.2664, + "step": 763 + }, + { + "epoch": 0.3, + "grad_norm": 5.00523516311952, + "learning_rate": 8.264826668666516e-06, + "loss": 0.3951, + "step": 764 + }, + { + "epoch": 0.3, + "grad_norm": 3.5683192166863535, + "learning_rate": 8.26008048026385e-06, + "loss": 0.1858, + "step": 765 + }, + { + "epoch": 0.3, + "grad_norm": 5.365757277166145, + "learning_rate": 8.255329176539552e-06, + "loss": 0.3197, + "step": 766 + }, + { + "epoch": 0.3, + "grad_norm": 7.54615438593096, + "learning_rate": 8.250572764948787e-06, + "loss": 0.6379, + "step": 767 + }, + { + "epoch": 0.3, + "grad_norm": 5.341950067856082, + "learning_rate": 8.245811252954741e-06, + "loss": 0.4991, + "step": 768 + }, + { + "epoch": 0.3, + "grad_norm": 5.890676412538074, + "learning_rate": 8.241044648028597e-06, + "loss": 0.6154, + "step": 769 + }, + { + "epoch": 0.3, + "grad_norm": 6.398870850919496, + "learning_rate": 8.236272957649534e-06, + "loss": 0.1698, + "step": 770 + }, + { + "epoch": 0.3, + "grad_norm": 5.32526464985607, + "learning_rate": 8.231496189304704e-06, + "loss": 0.2087, + "step": 771 + }, + { + "epoch": 0.3, + "grad_norm": 5.44216431315945, + "learning_rate": 8.226714350489235e-06, + "loss": 0.4175, + "step": 772 + }, + { + "epoch": 0.3, + "grad_norm": 7.473168102716619, + "learning_rate": 8.221927448706204e-06, + "loss": 0.4919, + "step": 773 + }, + { + "epoch": 0.3, + "grad_norm": 6.995285658195084, + "learning_rate": 8.217135491466636e-06, + "loss": 0.3075, + "step": 774 + }, + { + "epoch": 0.3, + "grad_norm": 5.124387228426363, + "learning_rate": 8.212338486289486e-06, + "loss": 0.4427, + "step": 775 + }, + { + "epoch": 0.3, + "grad_norm": 7.0730998605401, + "learning_rate": 8.207536440701633e-06, + "loss": 0.2747, + "step": 776 + }, + { + "epoch": 0.3, + "grad_norm": 6.207579874395538, + "learning_rate": 8.20272936223786e-06, + "loss": 0.4562, + "step": 777 + }, + { + "epoch": 0.3, + "grad_norm": 5.409820639904443, + "learning_rate": 8.197917258440851e-06, + "loss": 0.4363, + "step": 778 + }, + { + "epoch": 0.3, + "grad_norm": 5.147185631193565, + "learning_rate": 8.193100136861174e-06, + "loss": 0.2087, + "step": 779 + }, + { + "epoch": 0.3, + "grad_norm": 5.644403115592335, + "learning_rate": 8.18827800505727e-06, + "loss": 0.4722, + "step": 780 + }, + { + "epoch": 0.3, + "grad_norm": 5.593727345567647, + "learning_rate": 8.183450870595443e-06, + "loss": 0.4737, + "step": 781 + }, + { + "epoch": 0.3, + "grad_norm": 3.5883233493303393, + "learning_rate": 8.178618741049841e-06, + "loss": 0.1089, + "step": 782 + }, + { + "epoch": 0.3, + "grad_norm": 4.565888525765652, + "learning_rate": 8.173781624002456e-06, + "loss": 0.2804, + "step": 783 + }, + { + "epoch": 0.3, + "grad_norm": 3.90704359502963, + "learning_rate": 8.168939527043104e-06, + "loss": 0.2742, + "step": 784 + }, + { + "epoch": 0.3, + "grad_norm": 6.9667965149128985, + "learning_rate": 8.164092457769415e-06, + "loss": 0.2393, + "step": 785 + }, + { + "epoch": 0.3, + "grad_norm": 5.73666596849711, + "learning_rate": 8.15924042378682e-06, + "loss": 0.4671, + "step": 786 + }, + { + "epoch": 0.3, + "grad_norm": 6.498735388385353, + "learning_rate": 8.15438343270854e-06, + "loss": 0.4297, + "step": 787 + }, + { + "epoch": 0.3, + "grad_norm": 5.297444749293851, + "learning_rate": 8.149521492155573e-06, + "loss": 0.4278, + "step": 788 + }, + { + "epoch": 0.31, + "grad_norm": 4.887902501920645, + "learning_rate": 8.144654609756685e-06, + "loss": 0.386, + "step": 789 + }, + { + "epoch": 0.31, + "grad_norm": 5.53146198817337, + "learning_rate": 8.1397827931484e-06, + "loss": 0.4026, + "step": 790 + }, + { + "epoch": 0.31, + "grad_norm": 4.816985787516275, + "learning_rate": 8.134906049974975e-06, + "loss": 0.4611, + "step": 791 + }, + { + "epoch": 0.31, + "grad_norm": 3.7708813505955527, + "learning_rate": 8.130024387888402e-06, + "loss": 0.2199, + "step": 792 + }, + { + "epoch": 0.31, + "grad_norm": 5.9273358705222225, + "learning_rate": 8.125137814548394e-06, + "loss": 0.6214, + "step": 793 + }, + { + "epoch": 0.31, + "grad_norm": 3.907725612922181, + "learning_rate": 8.120246337622364e-06, + "loss": 0.3915, + "step": 794 + }, + { + "epoch": 0.31, + "grad_norm": 4.279782883297365, + "learning_rate": 8.115349964785425e-06, + "loss": 0.1899, + "step": 795 + }, + { + "epoch": 0.31, + "grad_norm": 4.328073155849916, + "learning_rate": 8.110448703720372e-06, + "loss": 0.4714, + "step": 796 + }, + { + "epoch": 0.31, + "grad_norm": 4.867309032110139, + "learning_rate": 8.105542562117663e-06, + "loss": 0.3014, + "step": 797 + }, + { + "epoch": 0.31, + "grad_norm": 5.471102341195892, + "learning_rate": 8.100631547675417e-06, + "loss": 0.2651, + "step": 798 + }, + { + "epoch": 0.31, + "grad_norm": 11.173289710599521, + "learning_rate": 8.095715668099406e-06, + "loss": 0.2958, + "step": 799 + }, + { + "epoch": 0.31, + "grad_norm": 5.1909467558791595, + "learning_rate": 8.090794931103026e-06, + "loss": 0.333, + "step": 800 + }, + { + "epoch": 0.31, + "grad_norm": 4.606462455050537, + "learning_rate": 8.085869344407301e-06, + "loss": 0.1894, + "step": 801 + }, + { + "epoch": 0.31, + "grad_norm": 5.061123196068857, + "learning_rate": 8.080938915740863e-06, + "loss": 0.4263, + "step": 802 + }, + { + "epoch": 0.31, + "grad_norm": 4.795244573922029, + "learning_rate": 8.076003652839936e-06, + "loss": 0.37, + "step": 803 + }, + { + "epoch": 0.31, + "grad_norm": 4.990154637814864, + "learning_rate": 8.071063563448341e-06, + "loss": 0.3033, + "step": 804 + }, + { + "epoch": 0.31, + "grad_norm": 4.179204271459913, + "learning_rate": 8.066118655317458e-06, + "loss": 0.1661, + "step": 805 + }, + { + "epoch": 0.31, + "grad_norm": 5.254996210451282, + "learning_rate": 8.06116893620624e-06, + "loss": 0.7173, + "step": 806 + }, + { + "epoch": 0.31, + "grad_norm": 6.06120092010497, + "learning_rate": 8.056214413881183e-06, + "loss": 0.577, + "step": 807 + }, + { + "epoch": 0.31, + "grad_norm": 5.733355509595694, + "learning_rate": 8.051255096116322e-06, + "loss": 0.4846, + "step": 808 + }, + { + "epoch": 0.31, + "grad_norm": 5.984245285971206, + "learning_rate": 8.04629099069321e-06, + "loss": 0.4047, + "step": 809 + }, + { + "epoch": 0.31, + "grad_norm": 5.7598327807264855, + "learning_rate": 8.041322105400923e-06, + "loss": 0.5301, + "step": 810 + }, + { + "epoch": 0.31, + "grad_norm": 7.195908506506143, + "learning_rate": 8.036348448036029e-06, + "loss": 0.6084, + "step": 811 + }, + { + "epoch": 0.31, + "grad_norm": 5.35207554435034, + "learning_rate": 8.031370026402585e-06, + "loss": 0.3607, + "step": 812 + }, + { + "epoch": 0.31, + "grad_norm": 4.32056746367122, + "learning_rate": 8.026386848312125e-06, + "loss": 0.3335, + "step": 813 + }, + { + "epoch": 0.31, + "grad_norm": 4.267898004484538, + "learning_rate": 8.021398921583644e-06, + "loss": 0.1832, + "step": 814 + }, + { + "epoch": 0.32, + "grad_norm": 4.80773704394529, + "learning_rate": 8.016406254043595e-06, + "loss": 0.269, + "step": 815 + }, + { + "epoch": 0.32, + "grad_norm": 3.322922906891823, + "learning_rate": 8.01140885352586e-06, + "loss": 0.2891, + "step": 816 + }, + { + "epoch": 0.32, + "grad_norm": 4.229550855302739, + "learning_rate": 8.006406727871754e-06, + "loss": 0.4966, + "step": 817 + }, + { + "epoch": 0.32, + "grad_norm": 5.314039777508532, + "learning_rate": 8.001399884930004e-06, + "loss": 0.6837, + "step": 818 + }, + { + "epoch": 0.32, + "grad_norm": 5.00525793622508, + "learning_rate": 7.996388332556735e-06, + "loss": 0.227, + "step": 819 + }, + { + "epoch": 0.32, + "grad_norm": 4.772809921299112, + "learning_rate": 7.991372078615469e-06, + "loss": 0.292, + "step": 820 + }, + { + "epoch": 0.32, + "grad_norm": 4.66819830861659, + "learning_rate": 7.9863511309771e-06, + "loss": 0.1952, + "step": 821 + }, + { + "epoch": 0.32, + "grad_norm": 5.0325243013892385, + "learning_rate": 7.981325497519892e-06, + "loss": 0.2871, + "step": 822 + }, + { + "epoch": 0.32, + "grad_norm": 7.253237938165674, + "learning_rate": 7.97629518612945e-06, + "loss": 0.6696, + "step": 823 + }, + { + "epoch": 0.32, + "grad_norm": 4.175000357164085, + "learning_rate": 7.971260204698732e-06, + "loss": 0.4533, + "step": 824 + }, + { + "epoch": 0.32, + "grad_norm": 5.069034528454901, + "learning_rate": 7.966220561128018e-06, + "loss": 0.3306, + "step": 825 + }, + { + "epoch": 0.32, + "grad_norm": 5.373203481491902, + "learning_rate": 7.961176263324902e-06, + "loss": 0.5284, + "step": 826 + }, + { + "epoch": 0.32, + "grad_norm": 3.871795914928086, + "learning_rate": 7.95612731920428e-06, + "loss": 0.3533, + "step": 827 + }, + { + "epoch": 0.32, + "grad_norm": 8.248578895471283, + "learning_rate": 7.951073736688348e-06, + "loss": 0.6994, + "step": 828 + }, + { + "epoch": 0.32, + "grad_norm": 4.9304222967760385, + "learning_rate": 7.946015523706566e-06, + "loss": 0.2685, + "step": 829 + }, + { + "epoch": 0.32, + "grad_norm": 5.010468902967375, + "learning_rate": 7.940952688195668e-06, + "loss": 0.6237, + "step": 830 + }, + { + "epoch": 0.32, + "grad_norm": 5.516919378035556, + "learning_rate": 7.93588523809964e-06, + "loss": 0.5792, + "step": 831 + }, + { + "epoch": 0.32, + "grad_norm": 4.710684931910942, + "learning_rate": 7.930813181369713e-06, + "loss": 0.7422, + "step": 832 + }, + { + "epoch": 0.32, + "grad_norm": 4.979364689556802, + "learning_rate": 7.925736525964332e-06, + "loss": 0.2954, + "step": 833 + }, + { + "epoch": 0.32, + "grad_norm": 3.24363576008588, + "learning_rate": 7.920655279849173e-06, + "loss": 0.3, + "step": 834 + }, + { + "epoch": 0.32, + "grad_norm": 7.45551648778925, + "learning_rate": 7.915569450997107e-06, + "loss": 0.4452, + "step": 835 + }, + { + "epoch": 0.32, + "grad_norm": 4.837107505748551, + "learning_rate": 7.9104790473882e-06, + "loss": 0.372, + "step": 836 + }, + { + "epoch": 0.32, + "grad_norm": 5.885385168899783, + "learning_rate": 7.905384077009693e-06, + "loss": 0.261, + "step": 837 + }, + { + "epoch": 0.32, + "grad_norm": 4.728421661944464, + "learning_rate": 7.900284547855992e-06, + "loss": 0.2118, + "step": 838 + }, + { + "epoch": 0.32, + "grad_norm": 5.25899147490279, + "learning_rate": 7.895180467928658e-06, + "loss": 0.6256, + "step": 839 + }, + { + "epoch": 0.32, + "grad_norm": 6.764744182282839, + "learning_rate": 7.890071845236395e-06, + "loss": 0.7904, + "step": 840 + }, + { + "epoch": 0.33, + "grad_norm": 3.3448007090140215, + "learning_rate": 7.88495868779503e-06, + "loss": 0.318, + "step": 841 + }, + { + "epoch": 0.33, + "grad_norm": 4.7238728497394575, + "learning_rate": 7.87984100362751e-06, + "loss": 0.5187, + "step": 842 + }, + { + "epoch": 0.33, + "grad_norm": 5.648105123380729, + "learning_rate": 7.874718800763876e-06, + "loss": 0.3736, + "step": 843 + }, + { + "epoch": 0.33, + "grad_norm": 4.992995102650532, + "learning_rate": 7.869592087241271e-06, + "loss": 0.7633, + "step": 844 + }, + { + "epoch": 0.33, + "grad_norm": 5.697566530596448, + "learning_rate": 7.86446087110391e-06, + "loss": 0.3344, + "step": 845 + }, + { + "epoch": 0.33, + "grad_norm": 6.027726262201498, + "learning_rate": 7.859325160403073e-06, + "loss": 0.3193, + "step": 846 + }, + { + "epoch": 0.33, + "grad_norm": 4.297653433057747, + "learning_rate": 7.854184963197088e-06, + "loss": 0.567, + "step": 847 + }, + { + "epoch": 0.33, + "grad_norm": 3.9724829344373047, + "learning_rate": 7.849040287551331e-06, + "loss": 0.2911, + "step": 848 + }, + { + "epoch": 0.33, + "grad_norm": 5.316916475097543, + "learning_rate": 7.843891141538201e-06, + "loss": 0.3931, + "step": 849 + }, + { + "epoch": 0.33, + "grad_norm": 7.1997865766796885, + "learning_rate": 7.838737533237111e-06, + "loss": 0.3999, + "step": 850 + }, + { + "epoch": 0.33, + "grad_norm": 4.399989189666105, + "learning_rate": 7.833579470734476e-06, + "loss": 0.3233, + "step": 851 + }, + { + "epoch": 0.33, + "grad_norm": 4.0342483467760175, + "learning_rate": 7.8284169621237e-06, + "loss": 0.219, + "step": 852 + }, + { + "epoch": 0.33, + "grad_norm": 4.715006791459248, + "learning_rate": 7.823250015505162e-06, + "loss": 0.4017, + "step": 853 + }, + { + "epoch": 0.33, + "grad_norm": 4.6502726800310175, + "learning_rate": 7.818078638986208e-06, + "loss": 0.3295, + "step": 854 + }, + { + "epoch": 0.33, + "grad_norm": 4.845699780687884, + "learning_rate": 7.812902840681132e-06, + "loss": 0.2155, + "step": 855 + }, + { + "epoch": 0.33, + "grad_norm": 4.466549450449988, + "learning_rate": 7.807722628711167e-06, + "loss": 0.3823, + "step": 856 + }, + { + "epoch": 0.33, + "grad_norm": 3.6928919746660114, + "learning_rate": 7.80253801120447e-06, + "loss": 0.1823, + "step": 857 + }, + { + "epoch": 0.33, + "grad_norm": 4.882294075493104, + "learning_rate": 7.797348996296116e-06, + "loss": 0.3771, + "step": 858 + }, + { + "epoch": 0.33, + "grad_norm": 5.407601911957696, + "learning_rate": 7.792155592128072e-06, + "loss": 0.456, + "step": 859 + }, + { + "epoch": 0.33, + "grad_norm": 5.3998668046998, + "learning_rate": 7.786957806849197e-06, + "loss": 0.5148, + "step": 860 + }, + { + "epoch": 0.33, + "grad_norm": 3.6673883324077163, + "learning_rate": 7.781755648615223e-06, + "loss": 0.148, + "step": 861 + }, + { + "epoch": 0.33, + "grad_norm": 3.145714133004689, + "learning_rate": 7.776549125588743e-06, + "loss": 0.346, + "step": 862 + }, + { + "epoch": 0.33, + "grad_norm": 7.278468737428145, + "learning_rate": 7.771338245939205e-06, + "loss": 0.424, + "step": 863 + }, + { + "epoch": 0.33, + "grad_norm": 5.292056531078291, + "learning_rate": 7.766123017842877e-06, + "loss": 0.4587, + "step": 864 + }, + { + "epoch": 0.33, + "grad_norm": 7.458109108457931, + "learning_rate": 7.76090344948287e-06, + "loss": 0.6041, + "step": 865 + }, + { + "epoch": 0.33, + "grad_norm": 5.349536451305507, + "learning_rate": 7.755679549049093e-06, + "loss": 0.528, + "step": 866 + }, + { + "epoch": 0.34, + "grad_norm": 3.3563335803250114, + "learning_rate": 7.75045132473825e-06, + "loss": 0.0775, + "step": 867 + }, + { + "epoch": 0.34, + "grad_norm": 4.506228819584935, + "learning_rate": 7.745218784753841e-06, + "loss": 0.1533, + "step": 868 + }, + { + "epoch": 0.34, + "grad_norm": 5.195522191451391, + "learning_rate": 7.739981937306127e-06, + "loss": 0.5374, + "step": 869 + }, + { + "epoch": 0.34, + "grad_norm": 5.016228061521943, + "learning_rate": 7.734740790612137e-06, + "loss": 0.4524, + "step": 870 + }, + { + "epoch": 0.34, + "grad_norm": 5.609372709352629, + "learning_rate": 7.729495352895633e-06, + "loss": 0.4124, + "step": 871 + }, + { + "epoch": 0.34, + "grad_norm": 4.531520674537432, + "learning_rate": 7.724245632387122e-06, + "loss": 0.3826, + "step": 872 + }, + { + "epoch": 0.34, + "grad_norm": 11.794990068253417, + "learning_rate": 7.718991637323828e-06, + "loss": 0.5657, + "step": 873 + }, + { + "epoch": 0.34, + "grad_norm": 5.98381321495084, + "learning_rate": 7.713733375949677e-06, + "loss": 0.427, + "step": 874 + }, + { + "epoch": 0.34, + "grad_norm": 5.08443591266254, + "learning_rate": 7.708470856515295e-06, + "loss": 0.1837, + "step": 875 + }, + { + "epoch": 0.34, + "grad_norm": 4.750278414313818, + "learning_rate": 7.703204087277989e-06, + "loss": 0.2697, + "step": 876 + }, + { + "epoch": 0.34, + "grad_norm": 5.06479166683791, + "learning_rate": 7.69793307650173e-06, + "loss": 0.375, + "step": 877 + }, + { + "epoch": 0.34, + "grad_norm": 6.511833008406359, + "learning_rate": 7.692657832457146e-06, + "loss": 0.5867, + "step": 878 + }, + { + "epoch": 0.34, + "grad_norm": 6.270173470987993, + "learning_rate": 7.687378363421512e-06, + "loss": 0.3481, + "step": 879 + }, + { + "epoch": 0.34, + "grad_norm": 4.964743391352324, + "learning_rate": 7.682094677678726e-06, + "loss": 0.2531, + "step": 880 + }, + { + "epoch": 0.34, + "grad_norm": 4.330434416723052, + "learning_rate": 7.676806783519304e-06, + "loss": 0.3912, + "step": 881 + }, + { + "epoch": 0.34, + "grad_norm": 5.75484369586985, + "learning_rate": 7.671514689240366e-06, + "loss": 0.3605, + "step": 882 + }, + { + "epoch": 0.34, + "grad_norm": 4.5040681375813305, + "learning_rate": 7.666218403145625e-06, + "loss": 0.3065, + "step": 883 + }, + { + "epoch": 0.34, + "grad_norm": 6.425655568614444, + "learning_rate": 7.660917933545367e-06, + "loss": 0.3633, + "step": 884 + }, + { + "epoch": 0.34, + "grad_norm": 5.2692885007461765, + "learning_rate": 7.655613288756443e-06, + "loss": 0.2331, + "step": 885 + }, + { + "epoch": 0.34, + "grad_norm": 3.8933744104639296, + "learning_rate": 7.650304477102258e-06, + "loss": 0.272, + "step": 886 + }, + { + "epoch": 0.34, + "grad_norm": 4.999363750590228, + "learning_rate": 7.644991506912753e-06, + "loss": 0.163, + "step": 887 + }, + { + "epoch": 0.34, + "grad_norm": 4.689783040358317, + "learning_rate": 7.639674386524395e-06, + "loss": 0.5043, + "step": 888 + }, + { + "epoch": 0.34, + "grad_norm": 4.595954396481512, + "learning_rate": 7.634353124280161e-06, + "loss": 0.3451, + "step": 889 + }, + { + "epoch": 0.34, + "grad_norm": 3.094488391215071, + "learning_rate": 7.629027728529527e-06, + "loss": 0.1512, + "step": 890 + }, + { + "epoch": 0.34, + "grad_norm": 5.484424693580878, + "learning_rate": 7.623698207628458e-06, + "loss": 0.2746, + "step": 891 + }, + { + "epoch": 0.34, + "grad_norm": 4.3526795832851395, + "learning_rate": 7.61836456993939e-06, + "loss": 0.2982, + "step": 892 + }, + { + "epoch": 0.35, + "grad_norm": 5.764334167761146, + "learning_rate": 7.613026823831217e-06, + "loss": 0.3148, + "step": 893 + }, + { + "epoch": 0.35, + "grad_norm": 4.504617406586926, + "learning_rate": 7.607684977679284e-06, + "loss": 0.1511, + "step": 894 + }, + { + "epoch": 0.35, + "grad_norm": 6.453008004875096, + "learning_rate": 7.602339039865362e-06, + "loss": 0.3653, + "step": 895 + }, + { + "epoch": 0.35, + "grad_norm": 5.738371851296883, + "learning_rate": 7.5969890187776474e-06, + "loss": 0.346, + "step": 896 + }, + { + "epoch": 0.35, + "grad_norm": 4.630936210205719, + "learning_rate": 7.591634922810744e-06, + "loss": 0.2132, + "step": 897 + }, + { + "epoch": 0.35, + "grad_norm": 6.379714333632608, + "learning_rate": 7.586276760365645e-06, + "loss": 0.3159, + "step": 898 + }, + { + "epoch": 0.35, + "grad_norm": 4.547032749723221, + "learning_rate": 7.580914539849731e-06, + "loss": 0.506, + "step": 899 + }, + { + "epoch": 0.35, + "grad_norm": 5.734574813134246, + "learning_rate": 7.575548269676741e-06, + "loss": 0.2792, + "step": 900 + }, + { + "epoch": 0.35, + "grad_norm": 5.110732626869322, + "learning_rate": 7.570177958266775e-06, + "loss": 0.4091, + "step": 901 + }, + { + "epoch": 0.35, + "grad_norm": 5.810898674658677, + "learning_rate": 7.564803614046276e-06, + "loss": 0.5199, + "step": 902 + }, + { + "epoch": 0.35, + "grad_norm": 4.390388498220371, + "learning_rate": 7.559425245448006e-06, + "loss": 0.2347, + "step": 903 + }, + { + "epoch": 0.35, + "grad_norm": 3.9401391012049816, + "learning_rate": 7.554042860911049e-06, + "loss": 0.3488, + "step": 904 + }, + { + "epoch": 0.35, + "grad_norm": 5.375961686591688, + "learning_rate": 7.548656468880788e-06, + "loss": 0.2671, + "step": 905 + }, + { + "epoch": 0.35, + "grad_norm": 4.652853364172935, + "learning_rate": 7.543266077808893e-06, + "loss": 0.3276, + "step": 906 + }, + { + "epoch": 0.35, + "grad_norm": 4.292072425620882, + "learning_rate": 7.53787169615331e-06, + "loss": 0.1898, + "step": 907 + }, + { + "epoch": 0.35, + "grad_norm": 6.401327363677469, + "learning_rate": 7.5324733323782465e-06, + "loss": 0.4271, + "step": 908 + }, + { + "epoch": 0.35, + "grad_norm": 6.544508218544989, + "learning_rate": 7.52707099495416e-06, + "loss": 0.518, + "step": 909 + }, + { + "epoch": 0.35, + "grad_norm": 6.844606060047626, + "learning_rate": 7.521664692357737e-06, + "loss": 0.6057, + "step": 910 + }, + { + "epoch": 0.35, + "grad_norm": 4.842438410217087, + "learning_rate": 7.516254433071894e-06, + "loss": 0.3442, + "step": 911 + }, + { + "epoch": 0.35, + "grad_norm": 6.680098754657996, + "learning_rate": 7.510840225585749e-06, + "loss": 0.1931, + "step": 912 + }, + { + "epoch": 0.35, + "grad_norm": 5.902844697369808, + "learning_rate": 7.50542207839462e-06, + "loss": 0.2623, + "step": 913 + }, + { + "epoch": 0.35, + "grad_norm": 3.8169096747986573, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5817, + "step": 914 + }, + { + "epoch": 0.35, + "grad_norm": 5.191379249441057, + "learning_rate": 7.49457399890956e-06, + "loss": 0.2416, + "step": 915 + }, + { + "epoch": 0.35, + "grad_norm": 5.833328428113343, + "learning_rate": 7.489144083637117e-06, + "loss": 0.635, + "step": 916 + }, + { + "epoch": 0.35, + "grad_norm": 4.489457809497737, + "learning_rate": 7.483710262702635e-06, + "loss": 0.3246, + "step": 917 + }, + { + "epoch": 0.35, + "grad_norm": 4.64131197732772, + "learning_rate": 7.478272544632204e-06, + "loss": 0.2175, + "step": 918 + }, + { + "epoch": 0.36, + "grad_norm": 4.812018202683798, + "learning_rate": 7.472830937958029e-06, + "loss": 0.2529, + "step": 919 + }, + { + "epoch": 0.36, + "grad_norm": 3.718869633788661, + "learning_rate": 7.467385451218418e-06, + "loss": 0.2765, + "step": 920 + }, + { + "epoch": 0.36, + "grad_norm": 6.063079789697448, + "learning_rate": 7.461936092957767e-06, + "loss": 0.4305, + "step": 921 + }, + { + "epoch": 0.36, + "grad_norm": 5.79127333539024, + "learning_rate": 7.456482871726545e-06, + "loss": 0.5141, + "step": 922 + }, + { + "epoch": 0.36, + "grad_norm": 3.728505755068348, + "learning_rate": 7.4510257960812824e-06, + "loss": 0.5281, + "step": 923 + }, + { + "epoch": 0.36, + "grad_norm": 4.1025867041171304, + "learning_rate": 7.44556487458456e-06, + "loss": 0.0836, + "step": 924 + }, + { + "epoch": 0.36, + "grad_norm": 5.678403047928699, + "learning_rate": 7.440100115804991e-06, + "loss": 0.3458, + "step": 925 + }, + { + "epoch": 0.36, + "grad_norm": 5.967428953129573, + "learning_rate": 7.434631528317209e-06, + "loss": 0.4046, + "step": 926 + }, + { + "epoch": 0.36, + "grad_norm": 4.232743773961248, + "learning_rate": 7.4291591207018565e-06, + "loss": 0.3373, + "step": 927 + }, + { + "epoch": 0.36, + "grad_norm": 3.9836939042896895, + "learning_rate": 7.4236829015455725e-06, + "loss": 0.1616, + "step": 928 + }, + { + "epoch": 0.36, + "grad_norm": 4.234073847373181, + "learning_rate": 7.418202879440969e-06, + "loss": 0.4425, + "step": 929 + }, + { + "epoch": 0.36, + "grad_norm": 5.790231896267228, + "learning_rate": 7.412719062986632e-06, + "loss": 0.5823, + "step": 930 + }, + { + "epoch": 0.36, + "grad_norm": 4.178475738206307, + "learning_rate": 7.407231460787099e-06, + "loss": 0.3375, + "step": 931 + }, + { + "epoch": 0.36, + "grad_norm": 4.099374284642818, + "learning_rate": 7.401740081452848e-06, + "loss": 0.6621, + "step": 932 + }, + { + "epoch": 0.36, + "grad_norm": 7.159467108092978, + "learning_rate": 7.396244933600285e-06, + "loss": 0.5217, + "step": 933 + }, + { + "epoch": 0.36, + "grad_norm": 4.9454721846184, + "learning_rate": 7.390746025851725e-06, + "loss": 0.2959, + "step": 934 + }, + { + "epoch": 0.36, + "grad_norm": 6.177230638443921, + "learning_rate": 7.385243366835385e-06, + "loss": 0.5034, + "step": 935 + }, + { + "epoch": 0.36, + "grad_norm": 6.041459142057979, + "learning_rate": 7.379736965185369e-06, + "loss": 0.2509, + "step": 936 + }, + { + "epoch": 0.36, + "grad_norm": 5.909045409772038, + "learning_rate": 7.374226829541652e-06, + "loss": 0.5253, + "step": 937 + }, + { + "epoch": 0.36, + "grad_norm": 4.416554472438831, + "learning_rate": 7.368712968550068e-06, + "loss": 0.4668, + "step": 938 + }, + { + "epoch": 0.36, + "grad_norm": 5.7027022064011295, + "learning_rate": 7.363195390862298e-06, + "loss": 0.2844, + "step": 939 + }, + { + "epoch": 0.36, + "grad_norm": 6.388308131366065, + "learning_rate": 7.3576741051358525e-06, + "loss": 0.4073, + "step": 940 + }, + { + "epoch": 0.36, + "grad_norm": 3.898365393008674, + "learning_rate": 7.352149120034062e-06, + "loss": 0.1619, + "step": 941 + }, + { + "epoch": 0.36, + "grad_norm": 4.8824562479569185, + "learning_rate": 7.3466204442260605e-06, + "loss": 0.4562, + "step": 942 + }, + { + "epoch": 0.36, + "grad_norm": 5.190760940230458, + "learning_rate": 7.341088086386775e-06, + "loss": 0.303, + "step": 943 + }, + { + "epoch": 0.36, + "grad_norm": 3.689279155420741, + "learning_rate": 7.3355520551969055e-06, + "loss": 0.3656, + "step": 944 + }, + { + "epoch": 0.37, + "grad_norm": 7.707317777738777, + "learning_rate": 7.330012359342919e-06, + "loss": 0.2931, + "step": 945 + }, + { + "epoch": 0.37, + "grad_norm": 7.166281060531139, + "learning_rate": 7.324469007517035e-06, + "loss": 0.5711, + "step": 946 + }, + { + "epoch": 0.37, + "grad_norm": 4.3640713862293055, + "learning_rate": 7.318922008417203e-06, + "loss": 0.6074, + "step": 947 + }, + { + "epoch": 0.37, + "grad_norm": 8.97816965650459, + "learning_rate": 7.313371370747104e-06, + "loss": 0.3745, + "step": 948 + }, + { + "epoch": 0.37, + "grad_norm": 6.139472376430774, + "learning_rate": 7.3078171032161175e-06, + "loss": 0.3448, + "step": 949 + }, + { + "epoch": 0.37, + "grad_norm": 4.036326371064701, + "learning_rate": 7.302259214539327e-06, + "loss": 0.5587, + "step": 950 + }, + { + "epoch": 0.37, + "grad_norm": 4.296169035536824, + "learning_rate": 7.296697713437495e-06, + "loss": 0.3618, + "step": 951 + }, + { + "epoch": 0.37, + "grad_norm": 5.890391546892352, + "learning_rate": 7.291132608637053e-06, + "loss": 0.5157, + "step": 952 + }, + { + "epoch": 0.37, + "grad_norm": 4.888042735854257, + "learning_rate": 7.285563908870086e-06, + "loss": 0.2886, + "step": 953 + }, + { + "epoch": 0.37, + "grad_norm": 4.254305854815032, + "learning_rate": 7.279991622874319e-06, + "loss": 0.1928, + "step": 954 + }, + { + "epoch": 0.37, + "grad_norm": 6.173601890042715, + "learning_rate": 7.274415759393107e-06, + "loss": 0.6018, + "step": 955 + }, + { + "epoch": 0.37, + "grad_norm": 6.339315049781077, + "learning_rate": 7.268836327175417e-06, + "loss": 0.5289, + "step": 956 + }, + { + "epoch": 0.37, + "grad_norm": 4.111630198110997, + "learning_rate": 7.263253334975812e-06, + "loss": 0.2405, + "step": 957 + }, + { + "epoch": 0.37, + "grad_norm": 5.280261001417196, + "learning_rate": 7.257666791554448e-06, + "loss": 0.2744, + "step": 958 + }, + { + "epoch": 0.37, + "grad_norm": 3.607057236377991, + "learning_rate": 7.252076705677046e-06, + "loss": 0.3588, + "step": 959 + }, + { + "epoch": 0.37, + "grad_norm": 3.417022781462303, + "learning_rate": 7.24648308611489e-06, + "loss": 0.487, + "step": 960 + }, + { + "epoch": 0.37, + "grad_norm": 5.384842219101503, + "learning_rate": 7.2408859416448065e-06, + "loss": 0.5305, + "step": 961 + }, + { + "epoch": 0.37, + "grad_norm": 4.815721179272225, + "learning_rate": 7.235285281049154e-06, + "loss": 0.194, + "step": 962 + }, + { + "epoch": 0.37, + "grad_norm": 5.1484704211368975, + "learning_rate": 7.22968111311581e-06, + "loss": 0.7335, + "step": 963 + }, + { + "epoch": 0.37, + "grad_norm": 4.792626651193991, + "learning_rate": 7.224073446638149e-06, + "loss": 0.4214, + "step": 964 + }, + { + "epoch": 0.37, + "grad_norm": 5.258760541652446, + "learning_rate": 7.21846229041504e-06, + "loss": 0.4073, + "step": 965 + }, + { + "epoch": 0.37, + "grad_norm": 5.614835640589382, + "learning_rate": 7.212847653250828e-06, + "loss": 0.5402, + "step": 966 + }, + { + "epoch": 0.37, + "grad_norm": 4.609410649170137, + "learning_rate": 7.207229543955319e-06, + "loss": 0.2112, + "step": 967 + }, + { + "epoch": 0.37, + "grad_norm": 4.816974369567264, + "learning_rate": 7.201607971343763e-06, + "loss": 0.2038, + "step": 968 + }, + { + "epoch": 0.37, + "grad_norm": 7.236157249748036, + "learning_rate": 7.195982944236853e-06, + "loss": 0.2521, + "step": 969 + }, + { + "epoch": 0.38, + "grad_norm": 5.402175205571623, + "learning_rate": 7.190354471460692e-06, + "loss": 0.2382, + "step": 970 + }, + { + "epoch": 0.38, + "grad_norm": 4.819611590718435, + "learning_rate": 7.1847225618467975e-06, + "loss": 0.3573, + "step": 971 + }, + { + "epoch": 0.38, + "grad_norm": 3.1465471735663746, + "learning_rate": 7.1790872242320775e-06, + "loss": 0.1005, + "step": 972 + }, + { + "epoch": 0.38, + "grad_norm": 5.640336574149923, + "learning_rate": 7.173448467458817e-06, + "loss": 0.3745, + "step": 973 + }, + { + "epoch": 0.38, + "grad_norm": 5.500844710543615, + "learning_rate": 7.167806300374665e-06, + "loss": 0.6411, + "step": 974 + }, + { + "epoch": 0.38, + "grad_norm": 5.689235896418885, + "learning_rate": 7.162160731832627e-06, + "loss": 0.3168, + "step": 975 + }, + { + "epoch": 0.38, + "grad_norm": 5.09472144596906, + "learning_rate": 7.15651177069104e-06, + "loss": 0.5588, + "step": 976 + }, + { + "epoch": 0.38, + "grad_norm": 3.4432814427661684, + "learning_rate": 7.1508594258135685e-06, + "loss": 0.4689, + "step": 977 + }, + { + "epoch": 0.38, + "grad_norm": 4.543801171222169, + "learning_rate": 7.145203706069183e-06, + "loss": 0.3286, + "step": 978 + }, + { + "epoch": 0.38, + "grad_norm": 3.6396114540245357, + "learning_rate": 7.139544620332151e-06, + "loss": 0.2385, + "step": 979 + }, + { + "epoch": 0.38, + "grad_norm": 3.974055244651273, + "learning_rate": 7.133882177482019e-06, + "loss": 0.4335, + "step": 980 + }, + { + "epoch": 0.38, + "grad_norm": 4.56871393982033, + "learning_rate": 7.128216386403608e-06, + "loss": 0.46, + "step": 981 + }, + { + "epoch": 0.38, + "grad_norm": 5.489308899737765, + "learning_rate": 7.122547255986985e-06, + "loss": 0.4826, + "step": 982 + }, + { + "epoch": 0.38, + "grad_norm": 4.607582520293363, + "learning_rate": 7.1168747951274596e-06, + "loss": 0.2561, + "step": 983 + }, + { + "epoch": 0.38, + "grad_norm": 3.463694616722066, + "learning_rate": 7.1111990127255684e-06, + "loss": 0.4603, + "step": 984 + }, + { + "epoch": 0.38, + "grad_norm": 5.945644671403082, + "learning_rate": 7.105519917687058e-06, + "loss": 0.6607, + "step": 985 + }, + { + "epoch": 0.38, + "grad_norm": 4.750019937917883, + "learning_rate": 7.099837518922873e-06, + "loss": 0.1904, + "step": 986 + }, + { + "epoch": 0.38, + "grad_norm": 4.482299071320674, + "learning_rate": 7.094151825349145e-06, + "loss": 0.3152, + "step": 987 + }, + { + "epoch": 0.38, + "grad_norm": 3.8434907070965467, + "learning_rate": 7.088462845887168e-06, + "loss": 0.4441, + "step": 988 + }, + { + "epoch": 0.38, + "grad_norm": 4.773628678898187, + "learning_rate": 7.082770589463398e-06, + "loss": 0.4988, + "step": 989 + }, + { + "epoch": 0.38, + "grad_norm": 4.213841525985786, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.4936, + "step": 990 + }, + { + "epoch": 0.38, + "grad_norm": 4.432768016996358, + "learning_rate": 7.071376281461994e-06, + "loss": 0.4266, + "step": 991 + }, + { + "epoch": 0.38, + "grad_norm": 6.157815564365318, + "learning_rate": 7.065674247762924e-06, + "loss": 0.5153, + "step": 992 + }, + { + "epoch": 0.38, + "grad_norm": 4.029204598653521, + "learning_rate": 7.059968972859155e-06, + "loss": 0.4522, + "step": 993 + }, + { + "epoch": 0.38, + "grad_norm": 4.046118804737403, + "learning_rate": 7.054260465702712e-06, + "loss": 0.4115, + "step": 994 + }, + { + "epoch": 0.38, + "grad_norm": 4.399234145678143, + "learning_rate": 7.04854873525069e-06, + "loss": 0.3488, + "step": 995 + }, + { + "epoch": 0.39, + "grad_norm": 4.811268916684834, + "learning_rate": 7.042833790465241e-06, + "loss": 0.442, + "step": 996 + }, + { + "epoch": 0.39, + "grad_norm": 6.097682814805097, + "learning_rate": 7.037115640313563e-06, + "loss": 0.5786, + "step": 997 + }, + { + "epoch": 0.39, + "grad_norm": 3.89406435386151, + "learning_rate": 7.031394293767879e-06, + "loss": 0.2893, + "step": 998 + }, + { + "epoch": 0.39, + "grad_norm": 4.741921341840173, + "learning_rate": 7.025669759805431e-06, + "loss": 0.3645, + "step": 999 + }, + { + "epoch": 0.39, + "grad_norm": 4.097084643366875, + "learning_rate": 7.019942047408461e-06, + "loss": 0.1835, + "step": 1000 + }, + { + "epoch": 0.39, + "grad_norm": 2.6627766749272523, + "learning_rate": 7.0142111655642e-06, + "loss": 0.2201, + "step": 1001 + }, + { + "epoch": 0.39, + "grad_norm": 4.037169315513487, + "learning_rate": 7.008477123264849e-06, + "loss": 0.2294, + "step": 1002 + }, + { + "epoch": 0.39, + "grad_norm": 4.723506514756312, + "learning_rate": 7.002739929507569e-06, + "loss": 0.2124, + "step": 1003 + }, + { + "epoch": 0.39, + "grad_norm": 5.303370968502258, + "learning_rate": 6.996999593294466e-06, + "loss": 0.2925, + "step": 1004 + }, + { + "epoch": 0.39, + "grad_norm": 6.279075537465586, + "learning_rate": 6.991256123632577e-06, + "loss": 0.7073, + "step": 1005 + }, + { + "epoch": 0.39, + "grad_norm": 5.04058869264715, + "learning_rate": 6.985509529533859e-06, + "loss": 0.5377, + "step": 1006 + }, + { + "epoch": 0.39, + "grad_norm": 3.8875963143709975, + "learning_rate": 6.979759820015166e-06, + "loss": 0.4554, + "step": 1007 + }, + { + "epoch": 0.39, + "grad_norm": 5.406641061009185, + "learning_rate": 6.974007004098243e-06, + "loss": 0.4458, + "step": 1008 + }, + { + "epoch": 0.39, + "grad_norm": 7.93528175971489, + "learning_rate": 6.968251090809708e-06, + "loss": 0.2491, + "step": 1009 + }, + { + "epoch": 0.39, + "grad_norm": 3.046559605036485, + "learning_rate": 6.96249208918104e-06, + "loss": 0.2308, + "step": 1010 + }, + { + "epoch": 0.39, + "grad_norm": 4.406269492678815, + "learning_rate": 6.956730008248565e-06, + "loss": 0.3685, + "step": 1011 + }, + { + "epoch": 0.39, + "grad_norm": 4.898383037743084, + "learning_rate": 6.950964857053436e-06, + "loss": 0.2557, + "step": 1012 + }, + { + "epoch": 0.39, + "grad_norm": 4.800167808069986, + "learning_rate": 6.945196644641631e-06, + "loss": 0.3821, + "step": 1013 + }, + { + "epoch": 0.39, + "grad_norm": 4.1650493510014055, + "learning_rate": 6.939425380063924e-06, + "loss": 0.5418, + "step": 1014 + }, + { + "epoch": 0.39, + "grad_norm": 8.478798815675141, + "learning_rate": 6.933651072375884e-06, + "loss": 0.2895, + "step": 1015 + }, + { + "epoch": 0.39, + "grad_norm": 2.7967882889163005, + "learning_rate": 6.927873730637849e-06, + "loss": 0.2873, + "step": 1016 + }, + { + "epoch": 0.39, + "grad_norm": 8.968736109982194, + "learning_rate": 6.922093363914923e-06, + "loss": 0.3376, + "step": 1017 + }, + { + "epoch": 0.39, + "grad_norm": 3.9904418841960014, + "learning_rate": 6.916309981276954e-06, + "loss": 0.4059, + "step": 1018 + }, + { + "epoch": 0.39, + "grad_norm": 4.889256192891055, + "learning_rate": 6.9105235917985215e-06, + "loss": 0.3218, + "step": 1019 + }, + { + "epoch": 0.39, + "grad_norm": 5.110778907024558, + "learning_rate": 6.9047342045589224e-06, + "loss": 0.4627, + "step": 1020 + }, + { + "epoch": 0.39, + "grad_norm": 6.685777372986321, + "learning_rate": 6.898941828642163e-06, + "loss": 0.4065, + "step": 1021 + }, + { + "epoch": 0.4, + "grad_norm": 6.331353037515637, + "learning_rate": 6.89314647313693e-06, + "loss": 0.342, + "step": 1022 + }, + { + "epoch": 0.4, + "grad_norm": 4.765773716278677, + "learning_rate": 6.887348147136592e-06, + "loss": 0.2128, + "step": 1023 + }, + { + "epoch": 0.4, + "grad_norm": 3.447548302533927, + "learning_rate": 6.8815468597391785e-06, + "loss": 0.4476, + "step": 1024 + }, + { + "epoch": 0.4, + "grad_norm": 5.147038446259682, + "learning_rate": 6.875742620047361e-06, + "loss": 0.4365, + "step": 1025 + }, + { + "epoch": 0.4, + "grad_norm": 3.0080880227394027, + "learning_rate": 6.869935437168449e-06, + "loss": 0.2937, + "step": 1026 + }, + { + "epoch": 0.4, + "grad_norm": 4.713681508637163, + "learning_rate": 6.864125320214364e-06, + "loss": 0.311, + "step": 1027 + }, + { + "epoch": 0.4, + "grad_norm": 3.9305866178493307, + "learning_rate": 6.858312278301638e-06, + "loss": 0.4492, + "step": 1028 + }, + { + "epoch": 0.4, + "grad_norm": 5.132501267665453, + "learning_rate": 6.852496320551387e-06, + "loss": 0.2314, + "step": 1029 + }, + { + "epoch": 0.4, + "grad_norm": 7.581024219407659, + "learning_rate": 6.846677456089305e-06, + "loss": 0.6607, + "step": 1030 + }, + { + "epoch": 0.4, + "grad_norm": 5.6534322322023804, + "learning_rate": 6.840855694045647e-06, + "loss": 0.222, + "step": 1031 + }, + { + "epoch": 0.4, + "grad_norm": 3.9230792561451957, + "learning_rate": 6.835031043555211e-06, + "loss": 0.4596, + "step": 1032 + }, + { + "epoch": 0.4, + "grad_norm": 5.4441479989012125, + "learning_rate": 6.829203513757332e-06, + "loss": 0.4975, + "step": 1033 + }, + { + "epoch": 0.4, + "grad_norm": 4.960178871281018, + "learning_rate": 6.82337311379586e-06, + "loss": 0.2867, + "step": 1034 + }, + { + "epoch": 0.4, + "grad_norm": 3.6012987952419007, + "learning_rate": 6.817539852819149e-06, + "loss": 0.2068, + "step": 1035 + }, + { + "epoch": 0.4, + "grad_norm": 9.873659106808283, + "learning_rate": 6.811703739980045e-06, + "loss": 0.4902, + "step": 1036 + }, + { + "epoch": 0.4, + "grad_norm": 3.9016539751511976, + "learning_rate": 6.8058647844358625e-06, + "loss": 0.4564, + "step": 1037 + }, + { + "epoch": 0.4, + "grad_norm": 4.615831624104007, + "learning_rate": 6.800022995348381e-06, + "loss": 0.2232, + "step": 1038 + }, + { + "epoch": 0.4, + "grad_norm": 5.517887212480329, + "learning_rate": 6.794178381883827e-06, + "loss": 0.3346, + "step": 1039 + }, + { + "epoch": 0.4, + "grad_norm": 3.874539223333172, + "learning_rate": 6.788330953212854e-06, + "loss": 0.4881, + "step": 1040 + }, + { + "epoch": 0.4, + "grad_norm": 7.216710935786579, + "learning_rate": 6.782480718510538e-06, + "loss": 0.6149, + "step": 1041 + }, + { + "epoch": 0.4, + "grad_norm": 5.037340989288139, + "learning_rate": 6.776627686956354e-06, + "loss": 0.5184, + "step": 1042 + }, + { + "epoch": 0.4, + "grad_norm": 4.499040907639204, + "learning_rate": 6.770771867734169e-06, + "loss": 0.6088, + "step": 1043 + }, + { + "epoch": 0.4, + "grad_norm": 4.994247873916255, + "learning_rate": 6.76491327003222e-06, + "loss": 0.6212, + "step": 1044 + }, + { + "epoch": 0.4, + "grad_norm": 4.941613195271821, + "learning_rate": 6.7590519030431054e-06, + "loss": 0.4855, + "step": 1045 + }, + { + "epoch": 0.4, + "grad_norm": 5.294732445961128, + "learning_rate": 6.753187775963773e-06, + "loss": 0.2688, + "step": 1046 + }, + { + "epoch": 0.4, + "grad_norm": 6.768122707458756, + "learning_rate": 6.747320897995493e-06, + "loss": 0.5366, + "step": 1047 + }, + { + "epoch": 0.41, + "grad_norm": 4.2196424193217545, + "learning_rate": 6.7414512783438575e-06, + "loss": 0.3854, + "step": 1048 + }, + { + "epoch": 0.41, + "grad_norm": 4.3578808239763775, + "learning_rate": 6.73557892621876e-06, + "loss": 0.348, + "step": 1049 + }, + { + "epoch": 0.41, + "grad_norm": 5.502428374052916, + "learning_rate": 6.729703850834381e-06, + "loss": 0.3777, + "step": 1050 + }, + { + "epoch": 0.41, + "grad_norm": 3.9362328259032635, + "learning_rate": 6.723826061409176e-06, + "loss": 0.3594, + "step": 1051 + }, + { + "epoch": 0.41, + "grad_norm": 5.30420303398428, + "learning_rate": 6.717945567165854e-06, + "loss": 0.4955, + "step": 1052 + }, + { + "epoch": 0.41, + "grad_norm": 5.615544401286117, + "learning_rate": 6.712062377331371e-06, + "loss": 0.5446, + "step": 1053 + }, + { + "epoch": 0.41, + "grad_norm": 6.902992375707539, + "learning_rate": 6.706176501136914e-06, + "loss": 0.3112, + "step": 1054 + }, + { + "epoch": 0.41, + "grad_norm": 5.408063450853472, + "learning_rate": 6.700287947817885e-06, + "loss": 0.2684, + "step": 1055 + }, + { + "epoch": 0.41, + "grad_norm": 4.036836225427641, + "learning_rate": 6.694396726613883e-06, + "loss": 0.2459, + "step": 1056 + }, + { + "epoch": 0.41, + "grad_norm": 5.066047406609241, + "learning_rate": 6.688502846768697e-06, + "loss": 0.513, + "step": 1057 + }, + { + "epoch": 0.41, + "grad_norm": 3.7869968785109425, + "learning_rate": 6.682606317530284e-06, + "loss": 0.172, + "step": 1058 + }, + { + "epoch": 0.41, + "grad_norm": 7.359044285906457, + "learning_rate": 6.676707148150763e-06, + "loss": 0.4342, + "step": 1059 + }, + { + "epoch": 0.41, + "grad_norm": 5.698820258121499, + "learning_rate": 6.670805347886392e-06, + "loss": 0.5618, + "step": 1060 + }, + { + "epoch": 0.41, + "grad_norm": 5.425351971010927, + "learning_rate": 6.6649009259975585e-06, + "loss": 0.4421, + "step": 1061 + }, + { + "epoch": 0.41, + "grad_norm": 5.285062575279868, + "learning_rate": 6.65899389174876e-06, + "loss": 0.2069, + "step": 1062 + }, + { + "epoch": 0.41, + "grad_norm": 3.8461884048317, + "learning_rate": 6.653084254408599e-06, + "loss": 0.4529, + "step": 1063 + }, + { + "epoch": 0.41, + "grad_norm": 6.527993871516133, + "learning_rate": 6.647172023249758e-06, + "loss": 0.5812, + "step": 1064 + }, + { + "epoch": 0.41, + "grad_norm": 4.838554724261919, + "learning_rate": 6.641257207548992e-06, + "loss": 0.3948, + "step": 1065 + }, + { + "epoch": 0.41, + "grad_norm": 3.8931808000422543, + "learning_rate": 6.635339816587109e-06, + "loss": 0.1686, + "step": 1066 + }, + { + "epoch": 0.41, + "grad_norm": 4.92898728170045, + "learning_rate": 6.629419859648959e-06, + "loss": 0.3539, + "step": 1067 + }, + { + "epoch": 0.41, + "grad_norm": 8.054418910841877, + "learning_rate": 6.6234973460234184e-06, + "loss": 0.6506, + "step": 1068 + }, + { + "epoch": 0.41, + "grad_norm": 4.327327643023074, + "learning_rate": 6.6175722850033755e-06, + "loss": 0.237, + "step": 1069 + }, + { + "epoch": 0.41, + "grad_norm": 6.088852064696572, + "learning_rate": 6.611644685885713e-06, + "loss": 0.3575, + "step": 1070 + }, + { + "epoch": 0.41, + "grad_norm": 6.5379329445691985, + "learning_rate": 6.6057145579713015e-06, + "loss": 0.3103, + "step": 1071 + }, + { + "epoch": 0.41, + "grad_norm": 5.238507620341176, + "learning_rate": 6.599781910564973e-06, + "loss": 0.5496, + "step": 1072 + }, + { + "epoch": 0.41, + "grad_norm": 4.2361587413049415, + "learning_rate": 6.593846752975518e-06, + "loss": 0.3359, + "step": 1073 + }, + { + "epoch": 0.42, + "grad_norm": 6.723970206443219, + "learning_rate": 6.587909094515663e-06, + "loss": 0.5703, + "step": 1074 + }, + { + "epoch": 0.42, + "grad_norm": 3.1645926050266273, + "learning_rate": 6.58196894450206e-06, + "loss": 0.0898, + "step": 1075 + }, + { + "epoch": 0.42, + "grad_norm": 3.90095699604733, + "learning_rate": 6.576026312255268e-06, + "loss": 0.4937, + "step": 1076 + }, + { + "epoch": 0.42, + "grad_norm": 5.332468638885417, + "learning_rate": 6.570081207099744e-06, + "loss": 0.5269, + "step": 1077 + }, + { + "epoch": 0.42, + "grad_norm": 3.85470005034806, + "learning_rate": 6.564133638363823e-06, + "loss": 0.2602, + "step": 1078 + }, + { + "epoch": 0.42, + "grad_norm": 4.026178179195995, + "learning_rate": 6.558183615379708e-06, + "loss": 0.3851, + "step": 1079 + }, + { + "epoch": 0.42, + "grad_norm": 5.683079412951481, + "learning_rate": 6.552231147483448e-06, + "loss": 0.537, + "step": 1080 + }, + { + "epoch": 0.42, + "grad_norm": 4.4753014518903536, + "learning_rate": 6.546276244014933e-06, + "loss": 0.3069, + "step": 1081 + }, + { + "epoch": 0.42, + "grad_norm": 4.759588871696704, + "learning_rate": 6.5403189143178725e-06, + "loss": 0.3842, + "step": 1082 + }, + { + "epoch": 0.42, + "grad_norm": 3.658300705288632, + "learning_rate": 6.534359167739784e-06, + "loss": 0.5289, + "step": 1083 + }, + { + "epoch": 0.42, + "grad_norm": 7.710464926969626, + "learning_rate": 6.528397013631975e-06, + "loss": 0.2284, + "step": 1084 + }, + { + "epoch": 0.42, + "grad_norm": 4.333273685475453, + "learning_rate": 6.522432461349536e-06, + "loss": 0.2599, + "step": 1085 + }, + { + "epoch": 0.42, + "grad_norm": 3.5792537507612883, + "learning_rate": 6.5164655202513135e-06, + "loss": 0.11, + "step": 1086 + }, + { + "epoch": 0.42, + "grad_norm": 5.530875134448532, + "learning_rate": 6.510496199699906e-06, + "loss": 0.5215, + "step": 1087 + }, + { + "epoch": 0.42, + "grad_norm": 3.8562640009376037, + "learning_rate": 6.504524509061646e-06, + "loss": 0.2141, + "step": 1088 + }, + { + "epoch": 0.42, + "grad_norm": 4.487716847033595, + "learning_rate": 6.498550457706584e-06, + "loss": 0.2573, + "step": 1089 + }, + { + "epoch": 0.42, + "grad_norm": 5.292321959679036, + "learning_rate": 6.492574055008474e-06, + "loss": 0.3859, + "step": 1090 + }, + { + "epoch": 0.42, + "grad_norm": 6.250167288607771, + "learning_rate": 6.4865953103447585e-06, + "loss": 0.1826, + "step": 1091 + }, + { + "epoch": 0.42, + "grad_norm": 4.002364415651722, + "learning_rate": 6.480614233096558e-06, + "loss": 0.2539, + "step": 1092 + }, + { + "epoch": 0.42, + "grad_norm": 3.399024878313421, + "learning_rate": 6.474630832648651e-06, + "loss": 0.1673, + "step": 1093 + }, + { + "epoch": 0.42, + "grad_norm": 3.575294574941341, + "learning_rate": 6.4686451183894604e-06, + "loss": 0.224, + "step": 1094 + }, + { + "epoch": 0.42, + "grad_norm": 6.657883459327564, + "learning_rate": 6.462657099711044e-06, + "loss": 0.4167, + "step": 1095 + }, + { + "epoch": 0.42, + "grad_norm": 3.6453216067272676, + "learning_rate": 6.4566667860090674e-06, + "loss": 0.3451, + "step": 1096 + }, + { + "epoch": 0.42, + "grad_norm": 6.65412738800545, + "learning_rate": 6.4506741866828035e-06, + "loss": 0.4441, + "step": 1097 + }, + { + "epoch": 0.42, + "grad_norm": 5.392961673927681, + "learning_rate": 6.444679311135112e-06, + "loss": 0.404, + "step": 1098 + }, + { + "epoch": 0.42, + "grad_norm": 4.858981707508947, + "learning_rate": 6.438682168772421e-06, + "loss": 0.5141, + "step": 1099 + }, + { + "epoch": 0.43, + "grad_norm": 5.983253209510925, + "learning_rate": 6.432682769004717e-06, + "loss": 0.301, + "step": 1100 + }, + { + "epoch": 0.43, + "grad_norm": 4.103632831354062, + "learning_rate": 6.426681121245527e-06, + "loss": 0.0999, + "step": 1101 + }, + { + "epoch": 0.43, + "grad_norm": 5.462511305934385, + "learning_rate": 6.420677234911908e-06, + "loss": 0.2751, + "step": 1102 + }, + { + "epoch": 0.43, + "grad_norm": 4.241387633043713, + "learning_rate": 6.414671119424426e-06, + "loss": 0.2869, + "step": 1103 + }, + { + "epoch": 0.43, + "grad_norm": 5.0268500807699805, + "learning_rate": 6.408662784207149e-06, + "loss": 0.4188, + "step": 1104 + }, + { + "epoch": 0.43, + "grad_norm": 5.99883312346557, + "learning_rate": 6.402652238687624e-06, + "loss": 0.4171, + "step": 1105 + }, + { + "epoch": 0.43, + "grad_norm": 3.513925241895601, + "learning_rate": 6.396639492296868e-06, + "loss": 0.103, + "step": 1106 + }, + { + "epoch": 0.43, + "grad_norm": 4.692900850075641, + "learning_rate": 6.390624554469351e-06, + "loss": 0.333, + "step": 1107 + }, + { + "epoch": 0.43, + "grad_norm": 4.17466899381888, + "learning_rate": 6.384607434642982e-06, + "loss": 0.6551, + "step": 1108 + }, + { + "epoch": 0.43, + "grad_norm": 4.87571778209691, + "learning_rate": 6.378588142259094e-06, + "loss": 0.4127, + "step": 1109 + }, + { + "epoch": 0.43, + "grad_norm": 3.647447251806186, + "learning_rate": 6.372566686762427e-06, + "loss": 0.2528, + "step": 1110 + }, + { + "epoch": 0.43, + "grad_norm": 4.7169210061877, + "learning_rate": 6.366543077601116e-06, + "loss": 0.3252, + "step": 1111 + }, + { + "epoch": 0.43, + "grad_norm": 7.381070555240646, + "learning_rate": 6.360517324226676e-06, + "loss": 0.2383, + "step": 1112 + }, + { + "epoch": 0.43, + "grad_norm": 5.056682885353319, + "learning_rate": 6.354489436093987e-06, + "loss": 0.3579, + "step": 1113 + }, + { + "epoch": 0.43, + "grad_norm": 5.728850574726878, + "learning_rate": 6.348459422661276e-06, + "loss": 0.5767, + "step": 1114 + }, + { + "epoch": 0.43, + "grad_norm": 4.7650735524192065, + "learning_rate": 6.342427293390108e-06, + "loss": 0.4645, + "step": 1115 + }, + { + "epoch": 0.43, + "grad_norm": 4.107658350302222, + "learning_rate": 6.336393057745365e-06, + "loss": 0.5181, + "step": 1116 + }, + { + "epoch": 0.43, + "grad_norm": 5.050427103553093, + "learning_rate": 6.330356725195237e-06, + "loss": 0.4415, + "step": 1117 + }, + { + "epoch": 0.43, + "grad_norm": 4.113583600744585, + "learning_rate": 6.324318305211201e-06, + "loss": 0.2275, + "step": 1118 + }, + { + "epoch": 0.43, + "grad_norm": 5.177665868869402, + "learning_rate": 6.318277807268013e-06, + "loss": 0.4803, + "step": 1119 + }, + { + "epoch": 0.43, + "grad_norm": 5.894687258632692, + "learning_rate": 6.312235240843685e-06, + "loss": 0.4936, + "step": 1120 + }, + { + "epoch": 0.43, + "grad_norm": 4.75435248759306, + "learning_rate": 6.306190615419476e-06, + "loss": 0.22, + "step": 1121 + }, + { + "epoch": 0.43, + "grad_norm": 8.892314258562756, + "learning_rate": 6.300143940479881e-06, + "loss": 0.6011, + "step": 1122 + }, + { + "epoch": 0.43, + "grad_norm": 4.379855244418597, + "learning_rate": 6.294095225512604e-06, + "loss": 0.3588, + "step": 1123 + }, + { + "epoch": 0.43, + "grad_norm": 2.884767438661871, + "learning_rate": 6.2880444800085535e-06, + "loss": 0.2015, + "step": 1124 + }, + { + "epoch": 0.43, + "grad_norm": 4.551356667662048, + "learning_rate": 6.281991713461823e-06, + "loss": 0.2999, + "step": 1125 + }, + { + "epoch": 0.44, + "grad_norm": 5.02565697507532, + "learning_rate": 6.275936935369675e-06, + "loss": 0.3088, + "step": 1126 + }, + { + "epoch": 0.44, + "grad_norm": 4.259174749542063, + "learning_rate": 6.269880155232534e-06, + "loss": 0.2645, + "step": 1127 + }, + { + "epoch": 0.44, + "grad_norm": 4.1260340427461255, + "learning_rate": 6.2638213825539595e-06, + "loss": 0.2912, + "step": 1128 + }, + { + "epoch": 0.44, + "grad_norm": 4.3305769669563485, + "learning_rate": 6.257760626840644e-06, + "loss": 0.3689, + "step": 1129 + }, + { + "epoch": 0.44, + "grad_norm": 5.018191721498685, + "learning_rate": 6.251697897602384e-06, + "loss": 0.4977, + "step": 1130 + }, + { + "epoch": 0.44, + "grad_norm": 5.162623471632947, + "learning_rate": 6.245633204352078e-06, + "loss": 0.2183, + "step": 1131 + }, + { + "epoch": 0.44, + "grad_norm": 4.104107461505769, + "learning_rate": 6.239566556605706e-06, + "loss": 0.411, + "step": 1132 + }, + { + "epoch": 0.44, + "grad_norm": 5.719875066793219, + "learning_rate": 6.233497963882314e-06, + "loss": 0.5093, + "step": 1133 + }, + { + "epoch": 0.44, + "grad_norm": 4.794565517083581, + "learning_rate": 6.227427435703997e-06, + "loss": 0.4455, + "step": 1134 + }, + { + "epoch": 0.44, + "grad_norm": 4.032460892724786, + "learning_rate": 6.22135498159589e-06, + "loss": 0.4597, + "step": 1135 + }, + { + "epoch": 0.44, + "grad_norm": 5.015209913146625, + "learning_rate": 6.215280611086149e-06, + "loss": 0.5067, + "step": 1136 + }, + { + "epoch": 0.44, + "grad_norm": 4.075035112182945, + "learning_rate": 6.209204333705937e-06, + "loss": 0.4369, + "step": 1137 + }, + { + "epoch": 0.44, + "grad_norm": 4.93502641479607, + "learning_rate": 6.203126158989411e-06, + "loss": 0.4362, + "step": 1138 + }, + { + "epoch": 0.44, + "grad_norm": 5.815104553602145, + "learning_rate": 6.197046096473701e-06, + "loss": 0.4026, + "step": 1139 + }, + { + "epoch": 0.44, + "grad_norm": 5.051113586771054, + "learning_rate": 6.190964155698903e-06, + "loss": 0.5172, + "step": 1140 + }, + { + "epoch": 0.44, + "grad_norm": 4.723729964761336, + "learning_rate": 6.184880346208056e-06, + "loss": 0.3351, + "step": 1141 + }, + { + "epoch": 0.44, + "grad_norm": 3.1836934395419347, + "learning_rate": 6.178794677547138e-06, + "loss": 0.3614, + "step": 1142 + }, + { + "epoch": 0.44, + "grad_norm": 6.2274658876489495, + "learning_rate": 6.1727071592650345e-06, + "loss": 0.3113, + "step": 1143 + }, + { + "epoch": 0.44, + "grad_norm": 5.908194274279897, + "learning_rate": 6.1666178009135426e-06, + "loss": 0.4549, + "step": 1144 + }, + { + "epoch": 0.44, + "grad_norm": 4.863167550276418, + "learning_rate": 6.160526612047339e-06, + "loss": 0.4472, + "step": 1145 + }, + { + "epoch": 0.44, + "grad_norm": 4.81905721465025, + "learning_rate": 6.154433602223979e-06, + "loss": 0.2906, + "step": 1146 + }, + { + "epoch": 0.44, + "grad_norm": 5.870031172516451, + "learning_rate": 6.148338781003873e-06, + "loss": 0.3564, + "step": 1147 + }, + { + "epoch": 0.44, + "grad_norm": 6.634649363433993, + "learning_rate": 6.14224215795027e-06, + "loss": 0.3849, + "step": 1148 + }, + { + "epoch": 0.44, + "grad_norm": 5.024500636140412, + "learning_rate": 6.136143742629252e-06, + "loss": 0.4495, + "step": 1149 + }, + { + "epoch": 0.44, + "grad_norm": 4.668985557086553, + "learning_rate": 6.130043544609707e-06, + "loss": 0.4648, + "step": 1150 + }, + { + "epoch": 0.44, + "grad_norm": 5.42891353501445, + "learning_rate": 6.123941573463327e-06, + "loss": 0.3441, + "step": 1151 + }, + { + "epoch": 0.45, + "grad_norm": 4.857536388028443, + "learning_rate": 6.117837838764579e-06, + "loss": 0.5887, + "step": 1152 + }, + { + "epoch": 0.45, + "grad_norm": 3.2776577992229217, + "learning_rate": 6.111732350090703e-06, + "loss": 0.3666, + "step": 1153 + }, + { + "epoch": 0.45, + "grad_norm": 5.096908639197986, + "learning_rate": 6.105625117021692e-06, + "loss": 0.4971, + "step": 1154 + }, + { + "epoch": 0.45, + "grad_norm": 3.3840741300229147, + "learning_rate": 6.099516149140267e-06, + "loss": 0.1509, + "step": 1155 + }, + { + "epoch": 0.45, + "grad_norm": 5.6619159705833555, + "learning_rate": 6.09340545603188e-06, + "loss": 0.4491, + "step": 1156 + }, + { + "epoch": 0.45, + "grad_norm": 5.258172288915472, + "learning_rate": 6.087293047284687e-06, + "loss": 0.5557, + "step": 1157 + }, + { + "epoch": 0.45, + "grad_norm": 5.236971319237751, + "learning_rate": 6.0811789324895365e-06, + "loss": 0.537, + "step": 1158 + }, + { + "epoch": 0.45, + "grad_norm": 5.080538142556825, + "learning_rate": 6.075063121239954e-06, + "loss": 0.5135, + "step": 1159 + }, + { + "epoch": 0.45, + "grad_norm": 4.884510279760993, + "learning_rate": 6.068945623132125e-06, + "loss": 0.3686, + "step": 1160 + }, + { + "epoch": 0.45, + "grad_norm": 6.718368897824202, + "learning_rate": 6.062826447764883e-06, + "loss": 0.2197, + "step": 1161 + }, + { + "epoch": 0.45, + "grad_norm": 5.142915658609104, + "learning_rate": 6.056705604739696e-06, + "loss": 0.302, + "step": 1162 + }, + { + "epoch": 0.45, + "grad_norm": 4.38869337946155, + "learning_rate": 6.050583103660643e-06, + "loss": 0.3433, + "step": 1163 + }, + { + "epoch": 0.45, + "grad_norm": 6.333723556530458, + "learning_rate": 6.044458954134411e-06, + "loss": 0.3373, + "step": 1164 + }, + { + "epoch": 0.45, + "grad_norm": 5.1630218045058935, + "learning_rate": 6.038333165770267e-06, + "loss": 0.2824, + "step": 1165 + }, + { + "epoch": 0.45, + "grad_norm": 2.964394593601022, + "learning_rate": 6.032205748180054e-06, + "loss": 0.3608, + "step": 1166 + }, + { + "epoch": 0.45, + "grad_norm": 5.07047404818321, + "learning_rate": 6.026076710978172e-06, + "loss": 0.4182, + "step": 1167 + }, + { + "epoch": 0.45, + "grad_norm": 3.965978987107587, + "learning_rate": 6.019946063781559e-06, + "loss": 0.1604, + "step": 1168 + }, + { + "epoch": 0.45, + "grad_norm": 5.586994011253932, + "learning_rate": 6.01381381620968e-06, + "loss": 0.4027, + "step": 1169 + }, + { + "epoch": 0.45, + "grad_norm": 4.133231119435957, + "learning_rate": 6.0076799778845105e-06, + "loss": 0.3024, + "step": 1170 + }, + { + "epoch": 0.45, + "grad_norm": 5.02658686052961, + "learning_rate": 6.001544558430527e-06, + "loss": 0.1995, + "step": 1171 + }, + { + "epoch": 0.45, + "grad_norm": 6.890567611341687, + "learning_rate": 5.99540756747468e-06, + "loss": 0.3806, + "step": 1172 + }, + { + "epoch": 0.45, + "grad_norm": 3.8223830234142473, + "learning_rate": 5.989269014646393e-06, + "loss": 0.2737, + "step": 1173 + }, + { + "epoch": 0.45, + "grad_norm": 4.429044083910661, + "learning_rate": 5.983128909577532e-06, + "loss": 0.2405, + "step": 1174 + }, + { + "epoch": 0.45, + "grad_norm": 3.261424803528859, + "learning_rate": 5.976987261902403e-06, + "loss": 0.247, + "step": 1175 + }, + { + "epoch": 0.45, + "grad_norm": 5.973107952087142, + "learning_rate": 5.970844081257734e-06, + "loss": 0.4896, + "step": 1176 + }, + { + "epoch": 0.46, + "grad_norm": 6.580671442780074, + "learning_rate": 5.964699377282657e-06, + "loss": 0.5314, + "step": 1177 + }, + { + "epoch": 0.46, + "grad_norm": 6.4169154107952195, + "learning_rate": 5.958553159618693e-06, + "loss": 0.4399, + "step": 1178 + }, + { + "epoch": 0.46, + "grad_norm": 5.0613887064501135, + "learning_rate": 5.952405437909738e-06, + "loss": 0.6028, + "step": 1179 + }, + { + "epoch": 0.46, + "grad_norm": 4.006412042006303, + "learning_rate": 5.946256221802052e-06, + "loss": 0.3751, + "step": 1180 + }, + { + "epoch": 0.46, + "grad_norm": 5.3575450429058185, + "learning_rate": 5.940105520944232e-06, + "loss": 0.4863, + "step": 1181 + }, + { + "epoch": 0.46, + "grad_norm": 4.024201890669275, + "learning_rate": 5.933953344987215e-06, + "loss": 0.5701, + "step": 1182 + }, + { + "epoch": 0.46, + "grad_norm": 5.93172089619764, + "learning_rate": 5.927799703584242e-06, + "loss": 0.5711, + "step": 1183 + }, + { + "epoch": 0.46, + "grad_norm": 3.534927950024668, + "learning_rate": 5.921644606390862e-06, + "loss": 0.3593, + "step": 1184 + }, + { + "epoch": 0.46, + "grad_norm": 5.060085078207586, + "learning_rate": 5.915488063064902e-06, + "loss": 0.2987, + "step": 1185 + }, + { + "epoch": 0.46, + "grad_norm": 4.934505560678856, + "learning_rate": 5.9093300832664625e-06, + "loss": 0.4388, + "step": 1186 + }, + { + "epoch": 0.46, + "grad_norm": 4.442592625568581, + "learning_rate": 5.903170676657894e-06, + "loss": 0.1431, + "step": 1187 + }, + { + "epoch": 0.46, + "grad_norm": 3.3185934657339384, + "learning_rate": 5.897009852903792e-06, + "loss": 0.2223, + "step": 1188 + }, + { + "epoch": 0.46, + "grad_norm": 4.734216767611118, + "learning_rate": 5.890847621670966e-06, + "loss": 0.1683, + "step": 1189 + }, + { + "epoch": 0.46, + "grad_norm": 4.865184566691955, + "learning_rate": 5.8846839926284435e-06, + "loss": 0.6131, + "step": 1190 + }, + { + "epoch": 0.46, + "grad_norm": 5.565513874168808, + "learning_rate": 5.878518975447439e-06, + "loss": 0.5469, + "step": 1191 + }, + { + "epoch": 0.46, + "grad_norm": 7.064162651208678, + "learning_rate": 5.872352579801351e-06, + "loss": 0.4186, + "step": 1192 + }, + { + "epoch": 0.46, + "grad_norm": 7.0208368031220365, + "learning_rate": 5.866184815365733e-06, + "loss": 0.1921, + "step": 1193 + }, + { + "epoch": 0.46, + "grad_norm": 5.248481097308518, + "learning_rate": 5.860015691818292e-06, + "loss": 0.5729, + "step": 1194 + }, + { + "epoch": 0.46, + "grad_norm": 4.049651461983444, + "learning_rate": 5.853845218838868e-06, + "loss": 0.1473, + "step": 1195 + }, + { + "epoch": 0.46, + "grad_norm": 4.609426453789442, + "learning_rate": 5.847673406109413e-06, + "loss": 0.1931, + "step": 1196 + }, + { + "epoch": 0.46, + "grad_norm": 4.196204602326466, + "learning_rate": 5.841500263313987e-06, + "loss": 0.5721, + "step": 1197 + }, + { + "epoch": 0.46, + "grad_norm": 7.011308483929049, + "learning_rate": 5.835325800138736e-06, + "loss": 0.4812, + "step": 1198 + }, + { + "epoch": 0.46, + "grad_norm": 3.405669496225648, + "learning_rate": 5.829150026271871e-06, + "loss": 0.1026, + "step": 1199 + }, + { + "epoch": 0.46, + "grad_norm": 6.433134817157713, + "learning_rate": 5.82297295140367e-06, + "loss": 0.4697, + "step": 1200 + }, + { + "epoch": 0.46, + "grad_norm": 4.247143845938337, + "learning_rate": 5.816794585226445e-06, + "loss": 0.4352, + "step": 1201 + }, + { + "epoch": 0.46, + "grad_norm": 4.019507920925523, + "learning_rate": 5.810614937434537e-06, + "loss": 0.2723, + "step": 1202 + }, + { + "epoch": 0.47, + "grad_norm": 5.467886743770641, + "learning_rate": 5.8044340177242995e-06, + "loss": 0.2454, + "step": 1203 + }, + { + "epoch": 0.47, + "grad_norm": 4.169152841005073, + "learning_rate": 5.7982518357940755e-06, + "loss": 0.4425, + "step": 1204 + }, + { + "epoch": 0.47, + "grad_norm": 4.067046748413299, + "learning_rate": 5.792068401344195e-06, + "loss": 0.3379, + "step": 1205 + }, + { + "epoch": 0.47, + "grad_norm": 6.89229424685211, + "learning_rate": 5.78588372407695e-06, + "loss": 0.4855, + "step": 1206 + }, + { + "epoch": 0.47, + "grad_norm": 6.432031432757888, + "learning_rate": 5.779697813696587e-06, + "loss": 0.4653, + "step": 1207 + }, + { + "epoch": 0.47, + "grad_norm": 7.166187334895502, + "learning_rate": 5.77351067990928e-06, + "loss": 0.3022, + "step": 1208 + }, + { + "epoch": 0.47, + "grad_norm": 4.566813188099879, + "learning_rate": 5.767322332423128e-06, + "loss": 0.3696, + "step": 1209 + }, + { + "epoch": 0.47, + "grad_norm": 4.873947643336162, + "learning_rate": 5.761132780948132e-06, + "loss": 0.4058, + "step": 1210 + }, + { + "epoch": 0.47, + "grad_norm": 4.8649685765666915, + "learning_rate": 5.7549420351961845e-06, + "loss": 0.2907, + "step": 1211 + }, + { + "epoch": 0.47, + "grad_norm": 2.972624031864407, + "learning_rate": 5.748750104881051e-06, + "loss": 0.0882, + "step": 1212 + }, + { + "epoch": 0.47, + "grad_norm": 4.629809020290339, + "learning_rate": 5.742556999718353e-06, + "loss": 0.4197, + "step": 1213 + }, + { + "epoch": 0.47, + "grad_norm": 4.019146007779661, + "learning_rate": 5.736362729425558e-06, + "loss": 0.4386, + "step": 1214 + }, + { + "epoch": 0.47, + "grad_norm": 4.879325619503961, + "learning_rate": 5.730167303721963e-06, + "loss": 0.2226, + "step": 1215 + }, + { + "epoch": 0.47, + "grad_norm": 5.302162364705677, + "learning_rate": 5.723970732328675e-06, + "loss": 0.3707, + "step": 1216 + }, + { + "epoch": 0.47, + "grad_norm": 7.693996901628024, + "learning_rate": 5.717773024968602e-06, + "loss": 0.2543, + "step": 1217 + }, + { + "epoch": 0.47, + "grad_norm": 5.45242990400126, + "learning_rate": 5.711574191366427e-06, + "loss": 0.2894, + "step": 1218 + }, + { + "epoch": 0.47, + "grad_norm": 5.69524047261105, + "learning_rate": 5.70537424124861e-06, + "loss": 0.4739, + "step": 1219 + }, + { + "epoch": 0.47, + "grad_norm": 10.42156011111522, + "learning_rate": 5.699173184343357e-06, + "loss": 0.3222, + "step": 1220 + }, + { + "epoch": 0.47, + "grad_norm": 3.5591476661146224, + "learning_rate": 5.692971030380613e-06, + "loss": 0.5504, + "step": 1221 + }, + { + "epoch": 0.47, + "grad_norm": 4.404897507050902, + "learning_rate": 5.686767789092041e-06, + "loss": 0.3497, + "step": 1222 + }, + { + "epoch": 0.47, + "grad_norm": 5.140349965762885, + "learning_rate": 5.6805634702110155e-06, + "loss": 0.4362, + "step": 1223 + }, + { + "epoch": 0.47, + "grad_norm": 3.6771882068304116, + "learning_rate": 5.674358083472598e-06, + "loss": 0.1782, + "step": 1224 + }, + { + "epoch": 0.47, + "grad_norm": 3.289961142006053, + "learning_rate": 5.668151638613524e-06, + "loss": 0.0961, + "step": 1225 + }, + { + "epoch": 0.47, + "grad_norm": 5.210728581639879, + "learning_rate": 5.661944145372193e-06, + "loss": 0.4107, + "step": 1226 + }, + { + "epoch": 0.47, + "grad_norm": 9.538860034888927, + "learning_rate": 5.655735613488651e-06, + "loss": 0.6956, + "step": 1227 + }, + { + "epoch": 0.47, + "grad_norm": 5.972687849652108, + "learning_rate": 5.649526052704567e-06, + "loss": 0.2778, + "step": 1228 + }, + { + "epoch": 0.48, + "grad_norm": 5.286073873714111, + "learning_rate": 5.643315472763229e-06, + "loss": 0.6749, + "step": 1229 + }, + { + "epoch": 0.48, + "grad_norm": 4.354439211123255, + "learning_rate": 5.637103883409525e-06, + "loss": 0.2811, + "step": 1230 + }, + { + "epoch": 0.48, + "grad_norm": 4.169129954895993, + "learning_rate": 5.630891294389923e-06, + "loss": 0.1283, + "step": 1231 + }, + { + "epoch": 0.48, + "grad_norm": 4.748502550405167, + "learning_rate": 5.624677715452465e-06, + "loss": 0.3549, + "step": 1232 + }, + { + "epoch": 0.48, + "grad_norm": 5.0941336491437115, + "learning_rate": 5.61846315634674e-06, + "loss": 0.3485, + "step": 1233 + }, + { + "epoch": 0.48, + "grad_norm": 3.531285950464142, + "learning_rate": 5.612247626823878e-06, + "loss": 0.3336, + "step": 1234 + }, + { + "epoch": 0.48, + "grad_norm": 6.1672818038900274, + "learning_rate": 5.606031136636534e-06, + "loss": 0.4179, + "step": 1235 + }, + { + "epoch": 0.48, + "grad_norm": 4.861177099264431, + "learning_rate": 5.599813695538866e-06, + "loss": 0.1578, + "step": 1236 + }, + { + "epoch": 0.48, + "grad_norm": 3.969074014830787, + "learning_rate": 5.593595313286526e-06, + "loss": 0.3372, + "step": 1237 + }, + { + "epoch": 0.48, + "grad_norm": 6.087763165405732, + "learning_rate": 5.587375999636645e-06, + "loss": 0.6821, + "step": 1238 + }, + { + "epoch": 0.48, + "grad_norm": 4.68834589702857, + "learning_rate": 5.581155764347812e-06, + "loss": 0.4139, + "step": 1239 + }, + { + "epoch": 0.48, + "grad_norm": 5.766295320914873, + "learning_rate": 5.574934617180063e-06, + "loss": 0.6889, + "step": 1240 + }, + { + "epoch": 0.48, + "grad_norm": 5.109333746550744, + "learning_rate": 5.568712567894866e-06, + "loss": 0.3811, + "step": 1241 + }, + { + "epoch": 0.48, + "grad_norm": 3.9414847431989393, + "learning_rate": 5.562489626255104e-06, + "loss": 0.1047, + "step": 1242 + }, + { + "epoch": 0.48, + "grad_norm": 5.386540890896379, + "learning_rate": 5.5562658020250585e-06, + "loss": 0.2444, + "step": 1243 + }, + { + "epoch": 0.48, + "grad_norm": 3.735938555005505, + "learning_rate": 5.550041104970398e-06, + "loss": 0.2727, + "step": 1244 + }, + { + "epoch": 0.48, + "grad_norm": 5.925222500287141, + "learning_rate": 5.5438155448581585e-06, + "loss": 0.5884, + "step": 1245 + }, + { + "epoch": 0.48, + "grad_norm": 4.740961407977184, + "learning_rate": 5.5375891314567335e-06, + "loss": 0.397, + "step": 1246 + }, + { + "epoch": 0.48, + "grad_norm": 7.306381654811139, + "learning_rate": 5.531361874535853e-06, + "loss": 0.5472, + "step": 1247 + }, + { + "epoch": 0.48, + "grad_norm": 4.947520287156723, + "learning_rate": 5.52513378386657e-06, + "loss": 0.2943, + "step": 1248 + }, + { + "epoch": 0.48, + "grad_norm": 4.3047376384719405, + "learning_rate": 5.518904869221245e-06, + "loss": 0.5119, + "step": 1249 + }, + { + "epoch": 0.48, + "grad_norm": 3.537957792999452, + "learning_rate": 5.512675140373537e-06, + "loss": 0.3736, + "step": 1250 + }, + { + "epoch": 0.48, + "grad_norm": 3.697394404742845, + "learning_rate": 5.506444607098375e-06, + "loss": 0.2577, + "step": 1251 + }, + { + "epoch": 0.48, + "grad_norm": 3.7575481500881733, + "learning_rate": 5.500213279171955e-06, + "loss": 0.2839, + "step": 1252 + }, + { + "epoch": 0.48, + "grad_norm": 5.325080892169685, + "learning_rate": 5.49398116637172e-06, + "loss": 0.5195, + "step": 1253 + }, + { + "epoch": 0.48, + "grad_norm": 4.310682333635457, + "learning_rate": 5.487748278476342e-06, + "loss": 0.3729, + "step": 1254 + }, + { + "epoch": 0.49, + "grad_norm": 4.542306776762291, + "learning_rate": 5.481514625265709e-06, + "loss": 0.3621, + "step": 1255 + }, + { + "epoch": 0.49, + "grad_norm": 5.240042011769809, + "learning_rate": 5.475280216520913e-06, + "loss": 0.2586, + "step": 1256 + }, + { + "epoch": 0.49, + "grad_norm": 3.3475441815800413, + "learning_rate": 5.469045062024231e-06, + "loss": 0.3936, + "step": 1257 + }, + { + "epoch": 0.49, + "grad_norm": 6.444550567362665, + "learning_rate": 5.462809171559104e-06, + "loss": 0.4832, + "step": 1258 + }, + { + "epoch": 0.49, + "grad_norm": 4.49277554364756, + "learning_rate": 5.456572554910137e-06, + "loss": 0.2727, + "step": 1259 + }, + { + "epoch": 0.49, + "grad_norm": 5.964218374207385, + "learning_rate": 5.450335221863068e-06, + "loss": 0.3893, + "step": 1260 + }, + { + "epoch": 0.49, + "grad_norm": 4.8200888851705495, + "learning_rate": 5.444097182204762e-06, + "loss": 0.251, + "step": 1261 + }, + { + "epoch": 0.49, + "grad_norm": 5.843886546917367, + "learning_rate": 5.437858445723191e-06, + "loss": 0.6017, + "step": 1262 + }, + { + "epoch": 0.49, + "grad_norm": 4.582756610616576, + "learning_rate": 5.431619022207422e-06, + "loss": 0.1755, + "step": 1263 + }, + { + "epoch": 0.49, + "grad_norm": 6.8065586351134115, + "learning_rate": 5.4253789214475975e-06, + "loss": 0.4714, + "step": 1264 + }, + { + "epoch": 0.49, + "grad_norm": 4.348854201133222, + "learning_rate": 5.4191381532349264e-06, + "loss": 0.0689, + "step": 1265 + }, + { + "epoch": 0.49, + "grad_norm": 4.308690605055334, + "learning_rate": 5.412896727361663e-06, + "loss": 0.2572, + "step": 1266 + }, + { + "epoch": 0.49, + "grad_norm": 5.8132512454305925, + "learning_rate": 5.406654653621092e-06, + "loss": 0.5671, + "step": 1267 + }, + { + "epoch": 0.49, + "grad_norm": 3.7302006434461283, + "learning_rate": 5.400411941807516e-06, + "loss": 0.6295, + "step": 1268 + }, + { + "epoch": 0.49, + "grad_norm": 4.79180495036863, + "learning_rate": 5.39416860171624e-06, + "loss": 0.3222, + "step": 1269 + }, + { + "epoch": 0.49, + "grad_norm": 4.133468204234008, + "learning_rate": 5.387924643143553e-06, + "loss": 0.2101, + "step": 1270 + }, + { + "epoch": 0.49, + "grad_norm": 3.7855426839085893, + "learning_rate": 5.381680075886716e-06, + "loss": 0.4163, + "step": 1271 + }, + { + "epoch": 0.49, + "grad_norm": 3.7767304290598305, + "learning_rate": 5.375434909743942e-06, + "loss": 0.0507, + "step": 1272 + }, + { + "epoch": 0.49, + "grad_norm": 2.8935497578214058, + "learning_rate": 5.369189154514388e-06, + "loss": 0.0535, + "step": 1273 + }, + { + "epoch": 0.49, + "grad_norm": 3.1298562133641754, + "learning_rate": 5.362942819998131e-06, + "loss": 0.4245, + "step": 1274 + }, + { + "epoch": 0.49, + "grad_norm": 3.7844454000946315, + "learning_rate": 5.356695915996162e-06, + "loss": 0.3826, + "step": 1275 + }, + { + "epoch": 0.49, + "grad_norm": 4.0834238707287165, + "learning_rate": 5.350448452310362e-06, + "loss": 0.4891, + "step": 1276 + }, + { + "epoch": 0.49, + "grad_norm": 6.304986818984701, + "learning_rate": 5.344200438743489e-06, + "loss": 0.5352, + "step": 1277 + }, + { + "epoch": 0.49, + "grad_norm": 5.033241097326988, + "learning_rate": 5.337951885099167e-06, + "loss": 0.6091, + "step": 1278 + }, + { + "epoch": 0.49, + "grad_norm": 5.301711417656752, + "learning_rate": 5.3317028011818635e-06, + "loss": 0.2551, + "step": 1279 + }, + { + "epoch": 0.49, + "grad_norm": 4.215987643808079, + "learning_rate": 5.3254531967968845e-06, + "loss": 0.3335, + "step": 1280 + }, + { + "epoch": 0.5, + "grad_norm": 3.966519649083018, + "learning_rate": 5.319203081750348e-06, + "loss": 0.295, + "step": 1281 + }, + { + "epoch": 0.5, + "grad_norm": 5.46701738285164, + "learning_rate": 5.312952465849173e-06, + "loss": 0.4782, + "step": 1282 + }, + { + "epoch": 0.5, + "grad_norm": 5.297541144411482, + "learning_rate": 5.306701358901066e-06, + "loss": 0.3318, + "step": 1283 + }, + { + "epoch": 0.5, + "grad_norm": 3.569942596811918, + "learning_rate": 5.300449770714502e-06, + "loss": 0.1394, + "step": 1284 + }, + { + "epoch": 0.5, + "grad_norm": 4.976107483882604, + "learning_rate": 5.294197711098716e-06, + "loss": 0.1441, + "step": 1285 + }, + { + "epoch": 0.5, + "grad_norm": 4.264637322106505, + "learning_rate": 5.287945189863676e-06, + "loss": 0.2133, + "step": 1286 + }, + { + "epoch": 0.5, + "grad_norm": 3.6591815017274834, + "learning_rate": 5.281692216820078e-06, + "loss": 0.3583, + "step": 1287 + }, + { + "epoch": 0.5, + "grad_norm": 5.76105833314619, + "learning_rate": 5.275438801779328e-06, + "loss": 0.4813, + "step": 1288 + }, + { + "epoch": 0.5, + "grad_norm": 3.635897240645958, + "learning_rate": 5.269184954553522e-06, + "loss": 0.2132, + "step": 1289 + }, + { + "epoch": 0.5, + "grad_norm": 3.8099601746613936, + "learning_rate": 5.262930684955439e-06, + "loss": 0.2209, + "step": 1290 + }, + { + "epoch": 0.5, + "grad_norm": 4.100196409365574, + "learning_rate": 5.2566760027985166e-06, + "loss": 0.4084, + "step": 1291 + }, + { + "epoch": 0.5, + "grad_norm": 4.165643243140668, + "learning_rate": 5.2504209178968395e-06, + "loss": 0.4816, + "step": 1292 + }, + { + "epoch": 0.5, + "grad_norm": 4.703730212664933, + "learning_rate": 5.2441654400651255e-06, + "loss": 0.283, + "step": 1293 + }, + { + "epoch": 0.5, + "grad_norm": 4.613521938704651, + "learning_rate": 5.237909579118713e-06, + "loss": 0.4435, + "step": 1294 + }, + { + "epoch": 0.5, + "grad_norm": 5.998155482772728, + "learning_rate": 5.231653344873534e-06, + "loss": 0.4067, + "step": 1295 + }, + { + "epoch": 0.5, + "grad_norm": 4.592365604412431, + "learning_rate": 5.225396747146112e-06, + "loss": 0.4533, + "step": 1296 + }, + { + "epoch": 0.5, + "grad_norm": 2.671757389339714, + "learning_rate": 5.219139795753539e-06, + "loss": 0.2443, + "step": 1297 + }, + { + "epoch": 0.5, + "grad_norm": 4.003199894047307, + "learning_rate": 5.212882500513462e-06, + "loss": 0.3811, + "step": 1298 + }, + { + "epoch": 0.5, + "grad_norm": 4.798964858875185, + "learning_rate": 5.206624871244066e-06, + "loss": 0.4514, + "step": 1299 + }, + { + "epoch": 0.5, + "grad_norm": 4.548843252426148, + "learning_rate": 5.200366917764062e-06, + "loss": 0.5154, + "step": 1300 + }, + { + "epoch": 0.5, + "grad_norm": 5.1170905572408065, + "learning_rate": 5.194108649892672e-06, + "loss": 0.4039, + "step": 1301 + }, + { + "epoch": 0.5, + "grad_norm": 4.527101598475088, + "learning_rate": 5.187850077449604e-06, + "loss": 0.3006, + "step": 1302 + }, + { + "epoch": 0.5, + "grad_norm": 4.887831108593034, + "learning_rate": 5.181591210255051e-06, + "loss": 0.3493, + "step": 1303 + }, + { + "epoch": 0.5, + "grad_norm": 4.4220653913357, + "learning_rate": 5.175332058129664e-06, + "loss": 0.5213, + "step": 1304 + }, + { + "epoch": 0.5, + "grad_norm": 4.949112894197746, + "learning_rate": 5.169072630894545e-06, + "loss": 0.4652, + "step": 1305 + }, + { + "epoch": 0.5, + "grad_norm": 3.5970766089444446, + "learning_rate": 5.162812938371226e-06, + "loss": 0.3941, + "step": 1306 + }, + { + "epoch": 0.51, + "grad_norm": 4.600667656635013, + "learning_rate": 5.1565529903816525e-06, + "loss": 0.2357, + "step": 1307 + }, + { + "epoch": 0.51, + "grad_norm": 4.603442867822164, + "learning_rate": 5.150292796748174e-06, + "loss": 0.5449, + "step": 1308 + }, + { + "epoch": 0.51, + "grad_norm": 4.572907471592153, + "learning_rate": 5.144032367293525e-06, + "loss": 0.3549, + "step": 1309 + }, + { + "epoch": 0.51, + "grad_norm": 4.918967329511568, + "learning_rate": 5.137771711840811e-06, + "loss": 0.1822, + "step": 1310 + }, + { + "epoch": 0.51, + "grad_norm": 4.751416230937269, + "learning_rate": 5.131510840213488e-06, + "loss": 0.1396, + "step": 1311 + }, + { + "epoch": 0.51, + "grad_norm": 5.127729907821657, + "learning_rate": 5.125249762235357e-06, + "loss": 0.3859, + "step": 1312 + }, + { + "epoch": 0.51, + "grad_norm": 4.276359069115821, + "learning_rate": 5.118988487730537e-06, + "loss": 0.5037, + "step": 1313 + }, + { + "epoch": 0.51, + "grad_norm": 8.559645832299882, + "learning_rate": 5.112727026523461e-06, + "loss": 0.3524, + "step": 1314 + }, + { + "epoch": 0.51, + "grad_norm": 5.098503918173065, + "learning_rate": 5.1064653884388515e-06, + "loss": 0.5115, + "step": 1315 + }, + { + "epoch": 0.51, + "grad_norm": 4.765279023013616, + "learning_rate": 5.100203583301706e-06, + "loss": 0.3172, + "step": 1316 + }, + { + "epoch": 0.51, + "grad_norm": 5.68916982314476, + "learning_rate": 5.0939416209372905e-06, + "loss": 0.364, + "step": 1317 + }, + { + "epoch": 0.51, + "grad_norm": 5.526477907946366, + "learning_rate": 5.087679511171113e-06, + "loss": 0.1592, + "step": 1318 + }, + { + "epoch": 0.51, + "grad_norm": 4.455813802201591, + "learning_rate": 5.081417263828914e-06, + "loss": 0.3091, + "step": 1319 + }, + { + "epoch": 0.51, + "grad_norm": 5.055962351478127, + "learning_rate": 5.075154888736653e-06, + "loss": 0.6518, + "step": 1320 + }, + { + "epoch": 0.51, + "grad_norm": 4.341974535809478, + "learning_rate": 5.068892395720482e-06, + "loss": 0.348, + "step": 1321 + }, + { + "epoch": 0.51, + "grad_norm": 4.614419414981455, + "learning_rate": 5.062629794606748e-06, + "loss": 0.5504, + "step": 1322 + }, + { + "epoch": 0.51, + "grad_norm": 5.004453011585351, + "learning_rate": 5.056367095221959e-06, + "loss": 0.4447, + "step": 1323 + }, + { + "epoch": 0.51, + "grad_norm": 4.619127552811418, + "learning_rate": 5.050104307392783e-06, + "loss": 0.2753, + "step": 1324 + }, + { + "epoch": 0.51, + "grad_norm": 4.568949512139795, + "learning_rate": 5.0438414409460234e-06, + "loss": 0.1231, + "step": 1325 + }, + { + "epoch": 0.51, + "grad_norm": 6.397955295331865, + "learning_rate": 5.03757850570861e-06, + "loss": 0.648, + "step": 1326 + }, + { + "epoch": 0.51, + "grad_norm": 9.44598650287468, + "learning_rate": 5.031315511507577e-06, + "loss": 0.4283, + "step": 1327 + }, + { + "epoch": 0.51, + "grad_norm": 4.832899160657488, + "learning_rate": 5.025052468170054e-06, + "loss": 0.3432, + "step": 1328 + }, + { + "epoch": 0.51, + "grad_norm": 3.8931850210303343, + "learning_rate": 5.018789385523245e-06, + "loss": 0.4084, + "step": 1329 + }, + { + "epoch": 0.51, + "grad_norm": 4.950788434997343, + "learning_rate": 5.01252627339442e-06, + "loss": 0.1616, + "step": 1330 + }, + { + "epoch": 0.51, + "grad_norm": 4.319062326100509, + "learning_rate": 5.006263141610891e-06, + "loss": 0.3678, + "step": 1331 + }, + { + "epoch": 0.51, + "grad_norm": 5.041192425356723, + "learning_rate": 5e-06, + "loss": 0.4499, + "step": 1332 + }, + { + "epoch": 0.52, + "grad_norm": 4.452899060490361, + "learning_rate": 4.993736858389111e-06, + "loss": 0.415, + "step": 1333 + }, + { + "epoch": 0.52, + "grad_norm": 3.8849610265370007, + "learning_rate": 4.987473726605581e-06, + "loss": 0.5768, + "step": 1334 + }, + { + "epoch": 0.52, + "grad_norm": 3.901606558710477, + "learning_rate": 4.981210614476755e-06, + "loss": 0.4216, + "step": 1335 + }, + { + "epoch": 0.52, + "grad_norm": 4.475995918212704, + "learning_rate": 4.974947531829947e-06, + "loss": 0.5771, + "step": 1336 + }, + { + "epoch": 0.52, + "grad_norm": 5.27343656664111, + "learning_rate": 4.968684488492424e-06, + "loss": 0.4872, + "step": 1337 + }, + { + "epoch": 0.52, + "grad_norm": 4.81995353853467, + "learning_rate": 4.9624214942913916e-06, + "loss": 0.3985, + "step": 1338 + }, + { + "epoch": 0.52, + "grad_norm": 5.106657369146505, + "learning_rate": 4.956158559053977e-06, + "loss": 0.3555, + "step": 1339 + }, + { + "epoch": 0.52, + "grad_norm": 3.9024337598370034, + "learning_rate": 4.949895692607218e-06, + "loss": 0.1435, + "step": 1340 + }, + { + "epoch": 0.52, + "grad_norm": 6.493152091605872, + "learning_rate": 4.9436329047780424e-06, + "loss": 0.2258, + "step": 1341 + }, + { + "epoch": 0.52, + "grad_norm": 5.034364669142676, + "learning_rate": 4.9373702053932534e-06, + "loss": 0.3212, + "step": 1342 + }, + { + "epoch": 0.52, + "grad_norm": 4.750046182270505, + "learning_rate": 4.9311076042795185e-06, + "loss": 0.4621, + "step": 1343 + }, + { + "epoch": 0.52, + "grad_norm": 4.365292555964028, + "learning_rate": 4.924845111263349e-06, + "loss": 0.1799, + "step": 1344 + }, + { + "epoch": 0.52, + "grad_norm": 5.040471548662068, + "learning_rate": 4.918582736171086e-06, + "loss": 0.3409, + "step": 1345 + }, + { + "epoch": 0.52, + "grad_norm": 3.13485029700616, + "learning_rate": 4.912320488828887e-06, + "loss": 0.1801, + "step": 1346 + }, + { + "epoch": 0.52, + "grad_norm": 4.574729597843415, + "learning_rate": 4.906058379062712e-06, + "loss": 0.4022, + "step": 1347 + }, + { + "epoch": 0.52, + "grad_norm": 5.276802780132034, + "learning_rate": 4.899796416698296e-06, + "loss": 0.6159, + "step": 1348 + }, + { + "epoch": 0.52, + "grad_norm": 4.6475497368492125, + "learning_rate": 4.893534611561152e-06, + "loss": 0.5064, + "step": 1349 + }, + { + "epoch": 0.52, + "grad_norm": 5.814418825204974, + "learning_rate": 4.88727297347654e-06, + "loss": 0.4989, + "step": 1350 + }, + { + "epoch": 0.52, + "grad_norm": 4.002754678951272, + "learning_rate": 4.881011512269464e-06, + "loss": 0.4045, + "step": 1351 + }, + { + "epoch": 0.52, + "grad_norm": 5.9827601091357865, + "learning_rate": 4.874750237764645e-06, + "loss": 0.2493, + "step": 1352 + }, + { + "epoch": 0.52, + "grad_norm": 3.693486262287033, + "learning_rate": 4.868489159786513e-06, + "loss": 0.444, + "step": 1353 + }, + { + "epoch": 0.52, + "grad_norm": 4.335688140712484, + "learning_rate": 4.862228288159191e-06, + "loss": 0.5152, + "step": 1354 + }, + { + "epoch": 0.52, + "grad_norm": 4.71196650068979, + "learning_rate": 4.8559676327064755e-06, + "loss": 0.4351, + "step": 1355 + }, + { + "epoch": 0.52, + "grad_norm": 6.674192796837757, + "learning_rate": 4.8497072032518274e-06, + "loss": 0.417, + "step": 1356 + }, + { + "epoch": 0.52, + "grad_norm": 4.649732752060125, + "learning_rate": 4.843447009618351e-06, + "loss": 0.3274, + "step": 1357 + }, + { + "epoch": 0.53, + "grad_norm": 4.3033211138913, + "learning_rate": 4.837187061628777e-06, + "loss": 0.1602, + "step": 1358 + }, + { + "epoch": 0.53, + "grad_norm": 4.315875668613648, + "learning_rate": 4.830927369105457e-06, + "loss": 0.3796, + "step": 1359 + }, + { + "epoch": 0.53, + "grad_norm": 4.685982332069527, + "learning_rate": 4.824667941870338e-06, + "loss": 0.4421, + "step": 1360 + }, + { + "epoch": 0.53, + "grad_norm": 5.347686458179461, + "learning_rate": 4.818408789744951e-06, + "loss": 0.3121, + "step": 1361 + }, + { + "epoch": 0.53, + "grad_norm": 4.657288387082717, + "learning_rate": 4.8121499225503974e-06, + "loss": 0.4413, + "step": 1362 + }, + { + "epoch": 0.53, + "grad_norm": 3.748923724907403, + "learning_rate": 4.8058913501073294e-06, + "loss": 0.4128, + "step": 1363 + }, + { + "epoch": 0.53, + "grad_norm": 5.405162615018604, + "learning_rate": 4.799633082235938e-06, + "loss": 0.3405, + "step": 1364 + }, + { + "epoch": 0.53, + "grad_norm": 5.414455172353855, + "learning_rate": 4.793375128755934e-06, + "loss": 0.3546, + "step": 1365 + }, + { + "epoch": 0.53, + "grad_norm": 4.736569171545471, + "learning_rate": 4.787117499486539e-06, + "loss": 0.3899, + "step": 1366 + }, + { + "epoch": 0.53, + "grad_norm": 4.884147814922411, + "learning_rate": 4.7808602042464615e-06, + "loss": 0.6129, + "step": 1367 + }, + { + "epoch": 0.53, + "grad_norm": 3.5321049022077795, + "learning_rate": 4.774603252853889e-06, + "loss": 0.1721, + "step": 1368 + }, + { + "epoch": 0.53, + "grad_norm": 5.500281378442747, + "learning_rate": 4.768346655126467e-06, + "loss": 0.5033, + "step": 1369 + }, + { + "epoch": 0.53, + "grad_norm": 5.718876105082406, + "learning_rate": 4.762090420881289e-06, + "loss": 0.1936, + "step": 1370 + }, + { + "epoch": 0.53, + "grad_norm": 5.199762316955981, + "learning_rate": 4.755834559934875e-06, + "loss": 0.284, + "step": 1371 + }, + { + "epoch": 0.53, + "grad_norm": 2.8804372991631624, + "learning_rate": 4.749579082103163e-06, + "loss": 0.2758, + "step": 1372 + }, + { + "epoch": 0.53, + "grad_norm": 3.9318027094172727, + "learning_rate": 4.743323997201485e-06, + "loss": 0.1626, + "step": 1373 + }, + { + "epoch": 0.53, + "grad_norm": 3.5867167821567083, + "learning_rate": 4.737069315044562e-06, + "loss": 0.1868, + "step": 1374 + }, + { + "epoch": 0.53, + "grad_norm": 4.514197990561089, + "learning_rate": 4.730815045446478e-06, + "loss": 0.3716, + "step": 1375 + }, + { + "epoch": 0.53, + "grad_norm": 4.776928401989964, + "learning_rate": 4.724561198220672e-06, + "loss": 0.3965, + "step": 1376 + }, + { + "epoch": 0.53, + "grad_norm": 3.9094633914677335, + "learning_rate": 4.718307783179924e-06, + "loss": 0.3339, + "step": 1377 + }, + { + "epoch": 0.53, + "grad_norm": 4.215323208202909, + "learning_rate": 4.712054810136327e-06, + "loss": 0.1818, + "step": 1378 + }, + { + "epoch": 0.53, + "grad_norm": 4.228300881993021, + "learning_rate": 4.705802288901286e-06, + "loss": 0.328, + "step": 1379 + }, + { + "epoch": 0.53, + "grad_norm": 5.464179689618592, + "learning_rate": 4.699550229285499e-06, + "loss": 0.6309, + "step": 1380 + }, + { + "epoch": 0.53, + "grad_norm": 3.678966528112276, + "learning_rate": 4.693298641098935e-06, + "loss": 0.1651, + "step": 1381 + }, + { + "epoch": 0.53, + "grad_norm": 5.459322859292966, + "learning_rate": 4.687047534150829e-06, + "loss": 0.3151, + "step": 1382 + }, + { + "epoch": 0.53, + "grad_norm": 6.2787024520816175, + "learning_rate": 4.680796918249653e-06, + "loss": 0.4516, + "step": 1383 + }, + { + "epoch": 0.54, + "grad_norm": 4.023343782044322, + "learning_rate": 4.674546803203116e-06, + "loss": 0.223, + "step": 1384 + }, + { + "epoch": 0.54, + "grad_norm": 4.09531271863508, + "learning_rate": 4.668297198818137e-06, + "loss": 0.1708, + "step": 1385 + }, + { + "epoch": 0.54, + "grad_norm": 4.754513599916183, + "learning_rate": 4.662048114900837e-06, + "loss": 0.1628, + "step": 1386 + }, + { + "epoch": 0.54, + "grad_norm": 4.220505886201745, + "learning_rate": 4.6557995612565146e-06, + "loss": 0.3616, + "step": 1387 + }, + { + "epoch": 0.54, + "grad_norm": 4.033642894202756, + "learning_rate": 4.649551547689641e-06, + "loss": 0.2401, + "step": 1388 + }, + { + "epoch": 0.54, + "grad_norm": 4.643153166140661, + "learning_rate": 4.643304084003839e-06, + "loss": 0.3787, + "step": 1389 + }, + { + "epoch": 0.54, + "grad_norm": 4.560528244548518, + "learning_rate": 4.6370571800018695e-06, + "loss": 0.1877, + "step": 1390 + }, + { + "epoch": 0.54, + "grad_norm": 4.388491608904832, + "learning_rate": 4.630810845485613e-06, + "loss": 0.1232, + "step": 1391 + }, + { + "epoch": 0.54, + "grad_norm": 5.93637784624128, + "learning_rate": 4.624565090256059e-06, + "loss": 0.365, + "step": 1392 + }, + { + "epoch": 0.54, + "grad_norm": 4.257432987702576, + "learning_rate": 4.618319924113286e-06, + "loss": 0.2224, + "step": 1393 + }, + { + "epoch": 0.54, + "grad_norm": 6.786647388241515, + "learning_rate": 4.612075356856447e-06, + "loss": 0.5707, + "step": 1394 + }, + { + "epoch": 0.54, + "grad_norm": 4.935423072133717, + "learning_rate": 4.60583139828376e-06, + "loss": 0.3992, + "step": 1395 + }, + { + "epoch": 0.54, + "grad_norm": 5.511290981704092, + "learning_rate": 4.599588058192487e-06, + "loss": 0.3921, + "step": 1396 + }, + { + "epoch": 0.54, + "grad_norm": 4.785240331651109, + "learning_rate": 4.5933453463789105e-06, + "loss": 0.6688, + "step": 1397 + }, + { + "epoch": 0.54, + "grad_norm": 4.561205508605427, + "learning_rate": 4.587103272638339e-06, + "loss": 0.3536, + "step": 1398 + }, + { + "epoch": 0.54, + "grad_norm": 5.352232431684483, + "learning_rate": 4.580861846765075e-06, + "loss": 0.6256, + "step": 1399 + }, + { + "epoch": 0.54, + "grad_norm": 4.446901089704637, + "learning_rate": 4.574621078552403e-06, + "loss": 0.6396, + "step": 1400 + }, + { + "epoch": 0.54, + "grad_norm": 6.3216933923128265, + "learning_rate": 4.568380977792581e-06, + "loss": 0.5249, + "step": 1401 + }, + { + "epoch": 0.54, + "grad_norm": 4.826445694399736, + "learning_rate": 4.562141554276811e-06, + "loss": 0.4093, + "step": 1402 + }, + { + "epoch": 0.54, + "grad_norm": 4.254819834764432, + "learning_rate": 4.555902817795239e-06, + "loss": 0.5174, + "step": 1403 + }, + { + "epoch": 0.54, + "grad_norm": 3.466656980375848, + "learning_rate": 4.549664778136933e-06, + "loss": 0.2572, + "step": 1404 + }, + { + "epoch": 0.54, + "grad_norm": 4.175437382426703, + "learning_rate": 4.543427445089863e-06, + "loss": 0.2028, + "step": 1405 + }, + { + "epoch": 0.54, + "grad_norm": 4.8911882705272, + "learning_rate": 4.537190828440898e-06, + "loss": 0.4679, + "step": 1406 + }, + { + "epoch": 0.54, + "grad_norm": 5.345626242449101, + "learning_rate": 4.5309549379757724e-06, + "loss": 0.2619, + "step": 1407 + }, + { + "epoch": 0.54, + "grad_norm": 5.47212229781788, + "learning_rate": 4.524719783479088e-06, + "loss": 0.4092, + "step": 1408 + }, + { + "epoch": 0.54, + "grad_norm": 5.3189583567005485, + "learning_rate": 4.5184853747342926e-06, + "loss": 0.3102, + "step": 1409 + }, + { + "epoch": 0.55, + "grad_norm": 4.748587805024686, + "learning_rate": 4.512251721523659e-06, + "loss": 0.4089, + "step": 1410 + }, + { + "epoch": 0.55, + "grad_norm": 4.137858866638523, + "learning_rate": 4.506018833628281e-06, + "loss": 0.104, + "step": 1411 + }, + { + "epoch": 0.55, + "grad_norm": 4.694425481506557, + "learning_rate": 4.499786720828046e-06, + "loss": 0.3668, + "step": 1412 + }, + { + "epoch": 0.55, + "grad_norm": 4.934987324623524, + "learning_rate": 4.493555392901626e-06, + "loss": 0.3352, + "step": 1413 + }, + { + "epoch": 0.55, + "grad_norm": 4.09958435430272, + "learning_rate": 4.487324859626465e-06, + "loss": 0.3388, + "step": 1414 + }, + { + "epoch": 0.55, + "grad_norm": 4.646619577266805, + "learning_rate": 4.481095130778756e-06, + "loss": 0.5508, + "step": 1415 + }, + { + "epoch": 0.55, + "grad_norm": 3.9639420627608435, + "learning_rate": 4.4748662161334335e-06, + "loss": 0.2911, + "step": 1416 + }, + { + "epoch": 0.55, + "grad_norm": 3.396224153912776, + "learning_rate": 4.4686381254641495e-06, + "loss": 0.0965, + "step": 1417 + }, + { + "epoch": 0.55, + "grad_norm": 3.283643875944138, + "learning_rate": 4.462410868543268e-06, + "loss": 0.3829, + "step": 1418 + }, + { + "epoch": 0.55, + "grad_norm": 3.688876838583003, + "learning_rate": 4.456184455141843e-06, + "loss": 0.1055, + "step": 1419 + }, + { + "epoch": 0.55, + "grad_norm": 4.134505921249341, + "learning_rate": 4.449958895029604e-06, + "loss": 0.1662, + "step": 1420 + }, + { + "epoch": 0.55, + "grad_norm": 4.357120548680949, + "learning_rate": 4.443734197974943e-06, + "loss": 0.5369, + "step": 1421 + }, + { + "epoch": 0.55, + "grad_norm": 4.264477593354643, + "learning_rate": 4.437510373744897e-06, + "loss": 0.3237, + "step": 1422 + }, + { + "epoch": 0.55, + "grad_norm": 3.6578273420833596, + "learning_rate": 4.431287432105134e-06, + "loss": 0.4314, + "step": 1423 + }, + { + "epoch": 0.55, + "grad_norm": 4.444778095296808, + "learning_rate": 4.4250653828199366e-06, + "loss": 0.5048, + "step": 1424 + }, + { + "epoch": 0.55, + "grad_norm": 4.096365587109272, + "learning_rate": 4.418844235652188e-06, + "loss": 0.1961, + "step": 1425 + }, + { + "epoch": 0.55, + "grad_norm": 4.145851994118955, + "learning_rate": 4.4126240003633565e-06, + "loss": 0.5091, + "step": 1426 + }, + { + "epoch": 0.55, + "grad_norm": 4.890069160290513, + "learning_rate": 4.4064046867134755e-06, + "loss": 0.573, + "step": 1427 + }, + { + "epoch": 0.55, + "grad_norm": 5.471820633417499, + "learning_rate": 4.400186304461136e-06, + "loss": 0.3534, + "step": 1428 + }, + { + "epoch": 0.55, + "grad_norm": 4.700061575505297, + "learning_rate": 4.393968863363468e-06, + "loss": 0.4357, + "step": 1429 + }, + { + "epoch": 0.55, + "grad_norm": 4.549335046345982, + "learning_rate": 4.387752373176123e-06, + "loss": 0.5236, + "step": 1430 + }, + { + "epoch": 0.55, + "grad_norm": 3.805485968388716, + "learning_rate": 4.381536843653262e-06, + "loss": 0.2868, + "step": 1431 + }, + { + "epoch": 0.55, + "grad_norm": 6.01988251960787, + "learning_rate": 4.375322284547536e-06, + "loss": 0.5035, + "step": 1432 + }, + { + "epoch": 0.55, + "grad_norm": 6.785660213357935, + "learning_rate": 4.369108705610077e-06, + "loss": 0.2794, + "step": 1433 + }, + { + "epoch": 0.55, + "grad_norm": 3.8903109542957437, + "learning_rate": 4.362896116590475e-06, + "loss": 0.3184, + "step": 1434 + }, + { + "epoch": 0.55, + "grad_norm": 3.9537539325999314, + "learning_rate": 4.356684527236773e-06, + "loss": 0.2546, + "step": 1435 + }, + { + "epoch": 0.56, + "grad_norm": 6.628936578239452, + "learning_rate": 4.3504739472954355e-06, + "loss": 0.3653, + "step": 1436 + }, + { + "epoch": 0.56, + "grad_norm": 4.881018316430561, + "learning_rate": 4.344264386511351e-06, + "loss": 0.1687, + "step": 1437 + }, + { + "epoch": 0.56, + "grad_norm": 3.9588487703461333, + "learning_rate": 4.3380558546278075e-06, + "loss": 0.3115, + "step": 1438 + }, + { + "epoch": 0.56, + "grad_norm": 5.701950011501446, + "learning_rate": 4.331848361386478e-06, + "loss": 0.5479, + "step": 1439 + }, + { + "epoch": 0.56, + "grad_norm": 3.710195317053987, + "learning_rate": 4.325641916527405e-06, + "loss": 0.2061, + "step": 1440 + }, + { + "epoch": 0.56, + "grad_norm": 4.741740338842962, + "learning_rate": 4.319436529788985e-06, + "loss": 0.3259, + "step": 1441 + }, + { + "epoch": 0.56, + "grad_norm": 5.145635596003466, + "learning_rate": 4.313232210907959e-06, + "loss": 0.2426, + "step": 1442 + }, + { + "epoch": 0.56, + "grad_norm": 4.446204265135104, + "learning_rate": 4.3070289696193886e-06, + "loss": 0.3982, + "step": 1443 + }, + { + "epoch": 0.56, + "grad_norm": 6.425547733704053, + "learning_rate": 4.3008268156566444e-06, + "loss": 0.5111, + "step": 1444 + }, + { + "epoch": 0.56, + "grad_norm": 4.33781990925659, + "learning_rate": 4.294625758751392e-06, + "loss": 0.3787, + "step": 1445 + }, + { + "epoch": 0.56, + "grad_norm": 4.103452942258401, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.3662, + "step": 1446 + }, + { + "epoch": 0.56, + "grad_norm": 4.0655623814372195, + "learning_rate": 4.282226975031402e-06, + "loss": 0.4652, + "step": 1447 + }, + { + "epoch": 0.56, + "grad_norm": 4.766225343491909, + "learning_rate": 4.276029267671327e-06, + "loss": 0.525, + "step": 1448 + }, + { + "epoch": 0.56, + "grad_norm": 3.472322802063309, + "learning_rate": 4.269832696278038e-06, + "loss": 0.131, + "step": 1449 + }, + { + "epoch": 0.56, + "grad_norm": 4.2210082573560435, + "learning_rate": 4.2636372705744425e-06, + "loss": 0.381, + "step": 1450 + }, + { + "epoch": 0.56, + "grad_norm": 4.068230476015941, + "learning_rate": 4.257443000281648e-06, + "loss": 0.1899, + "step": 1451 + }, + { + "epoch": 0.56, + "grad_norm": 4.062133989529346, + "learning_rate": 4.25124989511895e-06, + "loss": 0.2387, + "step": 1452 + }, + { + "epoch": 0.56, + "grad_norm": 4.253395716948605, + "learning_rate": 4.245057964803815e-06, + "loss": 0.3772, + "step": 1453 + }, + { + "epoch": 0.56, + "grad_norm": 5.066243021091036, + "learning_rate": 4.238867219051868e-06, + "loss": 0.2109, + "step": 1454 + }, + { + "epoch": 0.56, + "grad_norm": 4.31968234675491, + "learning_rate": 4.232677667576874e-06, + "loss": 0.3079, + "step": 1455 + }, + { + "epoch": 0.56, + "grad_norm": 4.9903737266786115, + "learning_rate": 4.226489320090723e-06, + "loss": 0.2489, + "step": 1456 + }, + { + "epoch": 0.56, + "grad_norm": 4.785627309087942, + "learning_rate": 4.220302186303416e-06, + "loss": 0.2637, + "step": 1457 + }, + { + "epoch": 0.56, + "grad_norm": 4.6660891871944035, + "learning_rate": 4.214116275923051e-06, + "loss": 0.3304, + "step": 1458 + }, + { + "epoch": 0.56, + "grad_norm": 3.552726695058103, + "learning_rate": 4.2079315986558056e-06, + "loss": 0.2224, + "step": 1459 + }, + { + "epoch": 0.56, + "grad_norm": 3.255849264152336, + "learning_rate": 4.201748164205925e-06, + "loss": 0.3666, + "step": 1460 + }, + { + "epoch": 0.56, + "grad_norm": 3.921409238431859, + "learning_rate": 4.195565982275701e-06, + "loss": 0.3434, + "step": 1461 + }, + { + "epoch": 0.57, + "grad_norm": 3.1791514628396063, + "learning_rate": 4.1893850625654626e-06, + "loss": 0.4433, + "step": 1462 + }, + { + "epoch": 0.57, + "grad_norm": 3.688767849030837, + "learning_rate": 4.183205414773555e-06, + "loss": 0.1771, + "step": 1463 + }, + { + "epoch": 0.57, + "grad_norm": 7.274330270883109, + "learning_rate": 4.17702704859633e-06, + "loss": 0.3412, + "step": 1464 + }, + { + "epoch": 0.57, + "grad_norm": 4.276379403881767, + "learning_rate": 4.1708499737281305e-06, + "loss": 0.2252, + "step": 1465 + }, + { + "epoch": 0.57, + "grad_norm": 4.4275714990157535, + "learning_rate": 4.1646741998612676e-06, + "loss": 0.3196, + "step": 1466 + }, + { + "epoch": 0.57, + "grad_norm": 5.805569818336066, + "learning_rate": 4.1584997366860145e-06, + "loss": 0.3324, + "step": 1467 + }, + { + "epoch": 0.57, + "grad_norm": 4.705265396538318, + "learning_rate": 4.1523265938905885e-06, + "loss": 0.213, + "step": 1468 + }, + { + "epoch": 0.57, + "grad_norm": 4.1938800793681095, + "learning_rate": 4.146154781161133e-06, + "loss": 0.4945, + "step": 1469 + }, + { + "epoch": 0.57, + "grad_norm": 2.0377205928657487, + "learning_rate": 4.1399843081817085e-06, + "loss": 0.0463, + "step": 1470 + }, + { + "epoch": 0.57, + "grad_norm": 3.3144372452006325, + "learning_rate": 4.133815184634269e-06, + "loss": 0.393, + "step": 1471 + }, + { + "epoch": 0.57, + "grad_norm": 3.017508208082044, + "learning_rate": 4.12764742019865e-06, + "loss": 0.2596, + "step": 1472 + }, + { + "epoch": 0.57, + "grad_norm": 3.7919045395811386, + "learning_rate": 4.121481024552561e-06, + "loss": 0.541, + "step": 1473 + }, + { + "epoch": 0.57, + "grad_norm": 4.95882206036411, + "learning_rate": 4.115316007371557e-06, + "loss": 0.4398, + "step": 1474 + }, + { + "epoch": 0.57, + "grad_norm": 4.66973771498853, + "learning_rate": 4.109152378329036e-06, + "loss": 0.3763, + "step": 1475 + }, + { + "epoch": 0.57, + "grad_norm": 5.847762996262314, + "learning_rate": 4.1029901470962105e-06, + "loss": 0.2938, + "step": 1476 + }, + { + "epoch": 0.57, + "grad_norm": 6.179071502240395, + "learning_rate": 4.0968293233421065e-06, + "loss": 0.1904, + "step": 1477 + }, + { + "epoch": 0.57, + "grad_norm": 3.9752752193041143, + "learning_rate": 4.090669916733539e-06, + "loss": 0.1964, + "step": 1478 + }, + { + "epoch": 0.57, + "grad_norm": 4.6355064008210185, + "learning_rate": 4.0845119369350995e-06, + "loss": 0.3628, + "step": 1479 + }, + { + "epoch": 0.57, + "grad_norm": 5.509494639492551, + "learning_rate": 4.078355393609139e-06, + "loss": 0.3887, + "step": 1480 + }, + { + "epoch": 0.57, + "grad_norm": 3.56076210034677, + "learning_rate": 4.0722002964157585e-06, + "loss": 0.3614, + "step": 1481 + }, + { + "epoch": 0.57, + "grad_norm": 5.470680502720224, + "learning_rate": 4.066046655012786e-06, + "loss": 0.2317, + "step": 1482 + }, + { + "epoch": 0.57, + "grad_norm": 5.997678909046692, + "learning_rate": 4.059894479055767e-06, + "loss": 0.3313, + "step": 1483 + }, + { + "epoch": 0.57, + "grad_norm": 3.0015515719649355, + "learning_rate": 4.053743778197951e-06, + "loss": 0.1241, + "step": 1484 + }, + { + "epoch": 0.57, + "grad_norm": 4.0533753005570246, + "learning_rate": 4.047594562090264e-06, + "loss": 0.2029, + "step": 1485 + }, + { + "epoch": 0.57, + "grad_norm": 6.129446225514946, + "learning_rate": 4.041446840381309e-06, + "loss": 0.3577, + "step": 1486 + }, + { + "epoch": 0.57, + "grad_norm": 5.693087869441793, + "learning_rate": 4.0353006227173455e-06, + "loss": 0.2564, + "step": 1487 + }, + { + "epoch": 0.58, + "grad_norm": 4.2698696515898, + "learning_rate": 4.029155918742268e-06, + "loss": 0.3401, + "step": 1488 + }, + { + "epoch": 0.58, + "grad_norm": 5.4125741124273725, + "learning_rate": 4.023012738097598e-06, + "loss": 0.4311, + "step": 1489 + }, + { + "epoch": 0.58, + "grad_norm": 4.510306762824228, + "learning_rate": 4.016871090422471e-06, + "loss": 0.2413, + "step": 1490 + }, + { + "epoch": 0.58, + "grad_norm": 4.219252553428604, + "learning_rate": 4.010730985353609e-06, + "loss": 0.3228, + "step": 1491 + }, + { + "epoch": 0.58, + "grad_norm": 3.9801080840084544, + "learning_rate": 4.00459243252532e-06, + "loss": 0.304, + "step": 1492 + }, + { + "epoch": 0.58, + "grad_norm": 5.5396212091755626, + "learning_rate": 3.998455441569473e-06, + "loss": 0.4879, + "step": 1493 + }, + { + "epoch": 0.58, + "grad_norm": 3.7772318325190013, + "learning_rate": 3.992320022115492e-06, + "loss": 0.4769, + "step": 1494 + }, + { + "epoch": 0.58, + "grad_norm": 6.0039394809010185, + "learning_rate": 3.986186183790323e-06, + "loss": 0.3172, + "step": 1495 + }, + { + "epoch": 0.58, + "grad_norm": 5.204364028635148, + "learning_rate": 3.980053936218444e-06, + "loss": 0.247, + "step": 1496 + }, + { + "epoch": 0.58, + "grad_norm": 4.317121569144144, + "learning_rate": 3.973923289021829e-06, + "loss": 0.3766, + "step": 1497 + }, + { + "epoch": 0.58, + "grad_norm": 4.998904443826016, + "learning_rate": 3.9677942518199465e-06, + "loss": 0.441, + "step": 1498 + }, + { + "epoch": 0.58, + "grad_norm": 5.433776021513012, + "learning_rate": 3.961666834229734e-06, + "loss": 0.3183, + "step": 1499 + }, + { + "epoch": 0.58, + "grad_norm": 5.369736004956909, + "learning_rate": 3.955541045865591e-06, + "loss": 0.3855, + "step": 1500 + }, + { + "epoch": 0.58, + "grad_norm": 2.9916271547290005, + "learning_rate": 3.949416896339357e-06, + "loss": 0.3071, + "step": 1501 + }, + { + "epoch": 0.58, + "grad_norm": 4.512835533817584, + "learning_rate": 3.943294395260305e-06, + "loss": 0.1732, + "step": 1502 + }, + { + "epoch": 0.58, + "grad_norm": 3.328324861730488, + "learning_rate": 3.937173552235117e-06, + "loss": 0.3571, + "step": 1503 + }, + { + "epoch": 0.58, + "grad_norm": 5.030903577474148, + "learning_rate": 3.931054376867877e-06, + "loss": 0.4652, + "step": 1504 + }, + { + "epoch": 0.58, + "grad_norm": 8.344547597819089, + "learning_rate": 3.924936878760048e-06, + "loss": 0.3512, + "step": 1505 + }, + { + "epoch": 0.58, + "grad_norm": 5.299986753165218, + "learning_rate": 3.918821067510464e-06, + "loss": 0.1322, + "step": 1506 + }, + { + "epoch": 0.58, + "grad_norm": 5.356397527437635, + "learning_rate": 3.912706952715314e-06, + "loss": 0.2978, + "step": 1507 + }, + { + "epoch": 0.58, + "grad_norm": 4.703673582358423, + "learning_rate": 3.906594543968122e-06, + "loss": 0.2282, + "step": 1508 + }, + { + "epoch": 0.58, + "grad_norm": 3.503063276257467, + "learning_rate": 3.900483850859735e-06, + "loss": 0.1066, + "step": 1509 + }, + { + "epoch": 0.58, + "grad_norm": 6.134104636570549, + "learning_rate": 3.89437488297831e-06, + "loss": 0.4441, + "step": 1510 + }, + { + "epoch": 0.58, + "grad_norm": 4.756484523844902, + "learning_rate": 3.888267649909296e-06, + "loss": 0.3926, + "step": 1511 + }, + { + "epoch": 0.58, + "grad_norm": 5.504780524924184, + "learning_rate": 3.882162161235421e-06, + "loss": 0.1757, + "step": 1512 + }, + { + "epoch": 0.58, + "grad_norm": 6.126572009653953, + "learning_rate": 3.876058426536674e-06, + "loss": 0.606, + "step": 1513 + }, + { + "epoch": 0.59, + "grad_norm": 4.663230747358417, + "learning_rate": 3.869956455390295e-06, + "loss": 0.4179, + "step": 1514 + }, + { + "epoch": 0.59, + "grad_norm": 4.181315471156965, + "learning_rate": 3.86385625737075e-06, + "loss": 0.289, + "step": 1515 + }, + { + "epoch": 0.59, + "grad_norm": 3.201227898458757, + "learning_rate": 3.857757842049732e-06, + "loss": 0.1346, + "step": 1516 + }, + { + "epoch": 0.59, + "grad_norm": 4.926969503211117, + "learning_rate": 3.851661218996129e-06, + "loss": 0.4248, + "step": 1517 + }, + { + "epoch": 0.59, + "grad_norm": 2.989317637241524, + "learning_rate": 3.845566397776022e-06, + "loss": 0.3161, + "step": 1518 + }, + { + "epoch": 0.59, + "grad_norm": 2.874726914919087, + "learning_rate": 3.839473387952662e-06, + "loss": 0.2798, + "step": 1519 + }, + { + "epoch": 0.59, + "grad_norm": 6.174614508967492, + "learning_rate": 3.833382199086459e-06, + "loss": 0.5569, + "step": 1520 + }, + { + "epoch": 0.59, + "grad_norm": 3.4966931392537384, + "learning_rate": 3.827292840734966e-06, + "loss": 0.2482, + "step": 1521 + }, + { + "epoch": 0.59, + "grad_norm": 3.9320863315443138, + "learning_rate": 3.821205322452863e-06, + "loss": 0.2117, + "step": 1522 + }, + { + "epoch": 0.59, + "grad_norm": 6.440310700744331, + "learning_rate": 3.815119653791943e-06, + "loss": 0.2702, + "step": 1523 + }, + { + "epoch": 0.59, + "grad_norm": 4.7478768889650524, + "learning_rate": 3.8090358443010993e-06, + "loss": 0.3356, + "step": 1524 + }, + { + "epoch": 0.59, + "grad_norm": 4.008056794967351, + "learning_rate": 3.802953903526301e-06, + "loss": 0.1924, + "step": 1525 + }, + { + "epoch": 0.59, + "grad_norm": 4.080002486180463, + "learning_rate": 3.796873841010591e-06, + "loss": 0.2048, + "step": 1526 + }, + { + "epoch": 0.59, + "grad_norm": 3.217585136256839, + "learning_rate": 3.7907956662940644e-06, + "loss": 0.0916, + "step": 1527 + }, + { + "epoch": 0.59, + "grad_norm": 3.8086105307397395, + "learning_rate": 3.784719388913853e-06, + "loss": 0.2909, + "step": 1528 + }, + { + "epoch": 0.59, + "grad_norm": 5.5725664063240155, + "learning_rate": 3.778645018404112e-06, + "loss": 0.4978, + "step": 1529 + }, + { + "epoch": 0.59, + "grad_norm": 6.3115117944969565, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.4703, + "step": 1530 + }, + { + "epoch": 0.59, + "grad_norm": 5.8455271216801306, + "learning_rate": 3.7665020361176872e-06, + "loss": 0.4041, + "step": 1531 + }, + { + "epoch": 0.59, + "grad_norm": 3.957077460576616, + "learning_rate": 3.760433443394294e-06, + "loss": 0.4267, + "step": 1532 + }, + { + "epoch": 0.59, + "grad_norm": 4.219325430622278, + "learning_rate": 3.754366795647923e-06, + "loss": 0.3815, + "step": 1533 + }, + { + "epoch": 0.59, + "grad_norm": 4.767149721252658, + "learning_rate": 3.748302102397618e-06, + "loss": 0.4733, + "step": 1534 + }, + { + "epoch": 0.59, + "grad_norm": 3.5153750892227036, + "learning_rate": 3.7422393731593586e-06, + "loss": 0.1929, + "step": 1535 + }, + { + "epoch": 0.59, + "grad_norm": 5.182170655134872, + "learning_rate": 3.7361786174460414e-06, + "loss": 0.2514, + "step": 1536 + }, + { + "epoch": 0.59, + "grad_norm": 4.401273986878207, + "learning_rate": 3.730119844767468e-06, + "loss": 0.1653, + "step": 1537 + }, + { + "epoch": 0.59, + "grad_norm": 3.9891550305249703, + "learning_rate": 3.7240630646303262e-06, + "loss": 0.4162, + "step": 1538 + }, + { + "epoch": 0.59, + "grad_norm": 6.475869932565687, + "learning_rate": 3.718008286538179e-06, + "loss": 0.3706, + "step": 1539 + }, + { + "epoch": 0.6, + "grad_norm": 7.624030179882473, + "learning_rate": 3.711955519991447e-06, + "loss": 0.2352, + "step": 1540 + }, + { + "epoch": 0.6, + "grad_norm": 5.4289749422225055, + "learning_rate": 3.705904774487396e-06, + "loss": 0.239, + "step": 1541 + }, + { + "epoch": 0.6, + "grad_norm": 5.672470473292984, + "learning_rate": 3.6998560595201188e-06, + "loss": 0.1951, + "step": 1542 + }, + { + "epoch": 0.6, + "grad_norm": 4.227877497623781, + "learning_rate": 3.6938093845805257e-06, + "loss": 0.4455, + "step": 1543 + }, + { + "epoch": 0.6, + "grad_norm": 4.585073856751496, + "learning_rate": 3.687764759156318e-06, + "loss": 0.2735, + "step": 1544 + }, + { + "epoch": 0.6, + "grad_norm": 4.731607106926894, + "learning_rate": 3.68172219273199e-06, + "loss": 0.3307, + "step": 1545 + }, + { + "epoch": 0.6, + "grad_norm": 4.404167408537416, + "learning_rate": 3.675681694788801e-06, + "loss": 0.3731, + "step": 1546 + }, + { + "epoch": 0.6, + "grad_norm": 4.515198173032717, + "learning_rate": 3.669643274804765e-06, + "loss": 0.188, + "step": 1547 + }, + { + "epoch": 0.6, + "grad_norm": 4.495410609475275, + "learning_rate": 3.6636069422546363e-06, + "loss": 0.2857, + "step": 1548 + }, + { + "epoch": 0.6, + "grad_norm": 5.37388058164263, + "learning_rate": 3.6575727066098936e-06, + "loss": 0.2787, + "step": 1549 + }, + { + "epoch": 0.6, + "grad_norm": 5.324020488687119, + "learning_rate": 3.6515405773387257e-06, + "loss": 0.4444, + "step": 1550 + }, + { + "epoch": 0.6, + "grad_norm": 5.0257406253270815, + "learning_rate": 3.645510563906014e-06, + "loss": 0.258, + "step": 1551 + }, + { + "epoch": 0.6, + "grad_norm": 3.370561128774498, + "learning_rate": 3.639482675773324e-06, + "loss": 0.1959, + "step": 1552 + }, + { + "epoch": 0.6, + "grad_norm": 4.2493296603566115, + "learning_rate": 3.633456922398887e-06, + "loss": 0.3543, + "step": 1553 + }, + { + "epoch": 0.6, + "grad_norm": 5.335576674837506, + "learning_rate": 3.627433313237576e-06, + "loss": 0.5657, + "step": 1554 + }, + { + "epoch": 0.6, + "grad_norm": 4.062024130226634, + "learning_rate": 3.621411857740908e-06, + "loss": 0.1434, + "step": 1555 + }, + { + "epoch": 0.6, + "grad_norm": 2.979363096325817, + "learning_rate": 3.6153925653570186e-06, + "loss": 0.4033, + "step": 1556 + }, + { + "epoch": 0.6, + "grad_norm": 4.71101568663758, + "learning_rate": 3.6093754455306495e-06, + "loss": 0.1745, + "step": 1557 + }, + { + "epoch": 0.6, + "grad_norm": 3.8585032404200943, + "learning_rate": 3.603360507703133e-06, + "loss": 0.4782, + "step": 1558 + }, + { + "epoch": 0.6, + "grad_norm": 4.784458526359172, + "learning_rate": 3.597347761312377e-06, + "loss": 0.5261, + "step": 1559 + }, + { + "epoch": 0.6, + "grad_norm": 3.843226432822785, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.3679, + "step": 1560 + }, + { + "epoch": 0.6, + "grad_norm": 6.966301117391833, + "learning_rate": 3.585328880575574e-06, + "loss": 0.3992, + "step": 1561 + }, + { + "epoch": 0.6, + "grad_norm": 8.367193452076943, + "learning_rate": 3.5793227650880928e-06, + "loss": 0.5085, + "step": 1562 + }, + { + "epoch": 0.6, + "grad_norm": 9.586351823066316, + "learning_rate": 3.573318878754475e-06, + "loss": 0.4149, + "step": 1563 + }, + { + "epoch": 0.6, + "grad_norm": 4.837885770381356, + "learning_rate": 3.5673172309952846e-06, + "loss": 0.3667, + "step": 1564 + }, + { + "epoch": 0.61, + "grad_norm": 3.695094278265337, + "learning_rate": 3.5613178312275796e-06, + "loss": 0.2563, + "step": 1565 + }, + { + "epoch": 0.61, + "grad_norm": 4.603957498733291, + "learning_rate": 3.555320688864889e-06, + "loss": 0.3894, + "step": 1566 + }, + { + "epoch": 0.61, + "grad_norm": 4.365398294496251, + "learning_rate": 3.549325813317197e-06, + "loss": 0.3296, + "step": 1567 + }, + { + "epoch": 0.61, + "grad_norm": 4.719689737508214, + "learning_rate": 3.5433332139909342e-06, + "loss": 0.2342, + "step": 1568 + }, + { + "epoch": 0.61, + "grad_norm": 4.083072422306283, + "learning_rate": 3.5373429002889583e-06, + "loss": 0.4403, + "step": 1569 + }, + { + "epoch": 0.61, + "grad_norm": 4.015949665717416, + "learning_rate": 3.531354881610539e-06, + "loss": 0.271, + "step": 1570 + }, + { + "epoch": 0.61, + "grad_norm": 3.561547003390101, + "learning_rate": 3.525369167351349e-06, + "loss": 0.3076, + "step": 1571 + }, + { + "epoch": 0.61, + "grad_norm": 3.618005155853049, + "learning_rate": 3.519385766903442e-06, + "loss": 0.2894, + "step": 1572 + }, + { + "epoch": 0.61, + "grad_norm": 4.0852155842053905, + "learning_rate": 3.5134046896552436e-06, + "loss": 0.3645, + "step": 1573 + }, + { + "epoch": 0.61, + "grad_norm": 5.439663902373668, + "learning_rate": 3.507425944991529e-06, + "loss": 0.4083, + "step": 1574 + }, + { + "epoch": 0.61, + "grad_norm": 5.057067586030128, + "learning_rate": 3.501449542293418e-06, + "loss": 0.4067, + "step": 1575 + }, + { + "epoch": 0.61, + "grad_norm": 3.1617977677922733, + "learning_rate": 3.495475490938355e-06, + "loss": 0.1281, + "step": 1576 + }, + { + "epoch": 0.61, + "grad_norm": 4.811102471895702, + "learning_rate": 3.4895038003000953e-06, + "loss": 0.5908, + "step": 1577 + }, + { + "epoch": 0.61, + "grad_norm": 4.033044332940408, + "learning_rate": 3.483534479748688e-06, + "loss": 0.4497, + "step": 1578 + }, + { + "epoch": 0.61, + "grad_norm": 6.015058932877953, + "learning_rate": 3.477567538650466e-06, + "loss": 0.4729, + "step": 1579 + }, + { + "epoch": 0.61, + "grad_norm": 3.4444982631762167, + "learning_rate": 3.4716029863680256e-06, + "loss": 0.3643, + "step": 1580 + }, + { + "epoch": 0.61, + "grad_norm": 4.638305557142188, + "learning_rate": 3.4656408322602168e-06, + "loss": 0.2885, + "step": 1581 + }, + { + "epoch": 0.61, + "grad_norm": 5.255174305001011, + "learning_rate": 3.4596810856821304e-06, + "loss": 0.3581, + "step": 1582 + }, + { + "epoch": 0.61, + "grad_norm": 5.523418614165512, + "learning_rate": 3.45372375598507e-06, + "loss": 0.3931, + "step": 1583 + }, + { + "epoch": 0.61, + "grad_norm": 3.9112513072493846, + "learning_rate": 3.447768852516554e-06, + "loss": 0.3582, + "step": 1584 + }, + { + "epoch": 0.61, + "grad_norm": 4.503925033573366, + "learning_rate": 3.4418163846202945e-06, + "loss": 0.2074, + "step": 1585 + }, + { + "epoch": 0.61, + "grad_norm": 5.17076176202868, + "learning_rate": 3.4358663616361775e-06, + "loss": 0.4885, + "step": 1586 + }, + { + "epoch": 0.61, + "grad_norm": 4.875859502345279, + "learning_rate": 3.429918792900257e-06, + "loss": 0.4738, + "step": 1587 + }, + { + "epoch": 0.61, + "grad_norm": 5.124966067519853, + "learning_rate": 3.4239736877447327e-06, + "loss": 0.327, + "step": 1588 + }, + { + "epoch": 0.61, + "grad_norm": 5.089392474409888, + "learning_rate": 3.4180310554979413e-06, + "loss": 0.3903, + "step": 1589 + }, + { + "epoch": 0.61, + "grad_norm": 4.857269282488471, + "learning_rate": 3.4120909054843375e-06, + "loss": 0.3071, + "step": 1590 + }, + { + "epoch": 0.62, + "grad_norm": 5.0690463549392355, + "learning_rate": 3.406153247024483e-06, + "loss": 0.2055, + "step": 1591 + }, + { + "epoch": 0.62, + "grad_norm": 3.4169111638971876, + "learning_rate": 3.400218089435029e-06, + "loss": 0.3195, + "step": 1592 + }, + { + "epoch": 0.62, + "grad_norm": 5.506896436639098, + "learning_rate": 3.3942854420287006e-06, + "loss": 0.6153, + "step": 1593 + }, + { + "epoch": 0.62, + "grad_norm": 4.323212175668684, + "learning_rate": 3.3883553141142884e-06, + "loss": 0.4096, + "step": 1594 + }, + { + "epoch": 0.62, + "grad_norm": 3.571988379752501, + "learning_rate": 3.3824277149966265e-06, + "loss": 0.2163, + "step": 1595 + }, + { + "epoch": 0.62, + "grad_norm": 3.925491898191029, + "learning_rate": 3.3765026539765832e-06, + "loss": 0.3347, + "step": 1596 + }, + { + "epoch": 0.62, + "grad_norm": 2.810934803829099, + "learning_rate": 3.3705801403510417e-06, + "loss": 0.2624, + "step": 1597 + }, + { + "epoch": 0.62, + "grad_norm": 5.13045526203993, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.4272, + "step": 1598 + }, + { + "epoch": 0.62, + "grad_norm": 5.679400999094655, + "learning_rate": 3.3587427924510086e-06, + "loss": 0.2084, + "step": 1599 + }, + { + "epoch": 0.62, + "grad_norm": 5.671096588916482, + "learning_rate": 3.352827976750242e-06, + "loss": 0.6803, + "step": 1600 + }, + { + "epoch": 0.62, + "grad_norm": 3.6355593379831377, + "learning_rate": 3.3469157455914013e-06, + "loss": 0.3139, + "step": 1601 + }, + { + "epoch": 0.62, + "grad_norm": 3.4716879424544707, + "learning_rate": 3.3410061082512422e-06, + "loss": 0.3129, + "step": 1602 + }, + { + "epoch": 0.62, + "grad_norm": 3.275693478616472, + "learning_rate": 3.335099074002445e-06, + "loss": 0.2482, + "step": 1603 + }, + { + "epoch": 0.62, + "grad_norm": 5.463750058107956, + "learning_rate": 3.32919465211361e-06, + "loss": 0.2431, + "step": 1604 + }, + { + "epoch": 0.62, + "grad_norm": 4.420931104299418, + "learning_rate": 3.323292851849238e-06, + "loss": 0.32, + "step": 1605 + }, + { + "epoch": 0.62, + "grad_norm": 3.5310184393281894, + "learning_rate": 3.3173936824697174e-06, + "loss": 0.2225, + "step": 1606 + }, + { + "epoch": 0.62, + "grad_norm": 4.898892610016629, + "learning_rate": 3.3114971532313058e-06, + "loss": 0.4428, + "step": 1607 + }, + { + "epoch": 0.62, + "grad_norm": 5.6983351425803095, + "learning_rate": 3.3056032733861188e-06, + "loss": 0.4583, + "step": 1608 + }, + { + "epoch": 0.62, + "grad_norm": 3.2872913356956315, + "learning_rate": 3.2997120521821168e-06, + "loss": 0.4078, + "step": 1609 + }, + { + "epoch": 0.62, + "grad_norm": 3.9667901873913753, + "learning_rate": 3.293823498863087e-06, + "loss": 0.1196, + "step": 1610 + }, + { + "epoch": 0.62, + "grad_norm": 3.444362037455661, + "learning_rate": 3.28793762266863e-06, + "loss": 0.1999, + "step": 1611 + }, + { + "epoch": 0.62, + "grad_norm": 4.1323845180596965, + "learning_rate": 3.2820544328341485e-06, + "loss": 0.2043, + "step": 1612 + }, + { + "epoch": 0.62, + "grad_norm": 4.669166522622902, + "learning_rate": 3.2761739385908264e-06, + "loss": 0.1308, + "step": 1613 + }, + { + "epoch": 0.62, + "grad_norm": 2.1742174942176495, + "learning_rate": 3.2702961491656197e-06, + "loss": 0.0665, + "step": 1614 + }, + { + "epoch": 0.62, + "grad_norm": 2.9569157853804575, + "learning_rate": 3.264421073781241e-06, + "loss": 0.3077, + "step": 1615 + }, + { + "epoch": 0.62, + "grad_norm": 4.188901513956453, + "learning_rate": 3.258548721656144e-06, + "loss": 0.1397, + "step": 1616 + }, + { + "epoch": 0.63, + "grad_norm": 5.478623580194558, + "learning_rate": 3.252679102004509e-06, + "loss": 0.407, + "step": 1617 + }, + { + "epoch": 0.63, + "grad_norm": 5.209800340962832, + "learning_rate": 3.2468122240362287e-06, + "loss": 0.522, + "step": 1618 + }, + { + "epoch": 0.63, + "grad_norm": 4.611061350039146, + "learning_rate": 3.240948096956894e-06, + "loss": 0.3241, + "step": 1619 + }, + { + "epoch": 0.63, + "grad_norm": 4.851662013093061, + "learning_rate": 3.2350867299677802e-06, + "loss": 0.4047, + "step": 1620 + }, + { + "epoch": 0.63, + "grad_norm": 4.4414185335000855, + "learning_rate": 3.2292281322658315e-06, + "loss": 0.6093, + "step": 1621 + }, + { + "epoch": 0.63, + "grad_norm": 9.256115339901157, + "learning_rate": 3.223372313043647e-06, + "loss": 0.4213, + "step": 1622 + }, + { + "epoch": 0.63, + "grad_norm": 7.060735569137316, + "learning_rate": 3.2175192814894627e-06, + "loss": 0.323, + "step": 1623 + }, + { + "epoch": 0.63, + "grad_norm": 3.2550775250574673, + "learning_rate": 3.211669046787147e-06, + "loss": 0.1256, + "step": 1624 + }, + { + "epoch": 0.63, + "grad_norm": 4.7634389316841625, + "learning_rate": 3.205821618116175e-06, + "loss": 0.3866, + "step": 1625 + }, + { + "epoch": 0.63, + "grad_norm": 3.887011889531534, + "learning_rate": 3.1999770046516198e-06, + "loss": 0.2934, + "step": 1626 + }, + { + "epoch": 0.63, + "grad_norm": 5.236055281555687, + "learning_rate": 3.194135215564139e-06, + "loss": 0.2454, + "step": 1627 + }, + { + "epoch": 0.63, + "grad_norm": 4.298067101862836, + "learning_rate": 3.188296260019956e-06, + "loss": 0.4734, + "step": 1628 + }, + { + "epoch": 0.63, + "grad_norm": 4.46461080053218, + "learning_rate": 3.1824601471808504e-06, + "loss": 0.1847, + "step": 1629 + }, + { + "epoch": 0.63, + "grad_norm": 2.6199644001707134, + "learning_rate": 3.1766268862041406e-06, + "loss": 0.1652, + "step": 1630 + }, + { + "epoch": 0.63, + "grad_norm": 4.916631814306934, + "learning_rate": 3.17079648624267e-06, + "loss": 0.1468, + "step": 1631 + }, + { + "epoch": 0.63, + "grad_norm": 3.5932787909655586, + "learning_rate": 3.164968956444791e-06, + "loss": 0.1869, + "step": 1632 + }, + { + "epoch": 0.63, + "grad_norm": 4.015353888738545, + "learning_rate": 3.159144305954356e-06, + "loss": 0.3621, + "step": 1633 + }, + { + "epoch": 0.63, + "grad_norm": 3.9500143540112194, + "learning_rate": 3.1533225439106965e-06, + "loss": 0.1496, + "step": 1634 + }, + { + "epoch": 0.63, + "grad_norm": 7.794321715793806, + "learning_rate": 3.1475036794486147e-06, + "loss": 0.3855, + "step": 1635 + }, + { + "epoch": 0.63, + "grad_norm": 3.440922482063624, + "learning_rate": 3.141687721698363e-06, + "loss": 0.1055, + "step": 1636 + }, + { + "epoch": 0.63, + "grad_norm": 3.6663959358882092, + "learning_rate": 3.1358746797856367e-06, + "loss": 0.422, + "step": 1637 + }, + { + "epoch": 0.63, + "grad_norm": 7.932315484073457, + "learning_rate": 3.130064562831553e-06, + "loss": 0.1827, + "step": 1638 + }, + { + "epoch": 0.63, + "grad_norm": 4.059351081158459, + "learning_rate": 3.1242573799526397e-06, + "loss": 0.3927, + "step": 1639 + }, + { + "epoch": 0.63, + "grad_norm": 4.642735050617873, + "learning_rate": 3.118453140260823e-06, + "loss": 0.2246, + "step": 1640 + }, + { + "epoch": 0.63, + "grad_norm": 5.914140625367253, + "learning_rate": 3.1126518528634096e-06, + "loss": 0.5902, + "step": 1641 + }, + { + "epoch": 0.63, + "grad_norm": 8.27481680069469, + "learning_rate": 3.106853526863073e-06, + "loss": 0.3063, + "step": 1642 + }, + { + "epoch": 0.64, + "grad_norm": 3.9534193599306784, + "learning_rate": 3.1010581713578403e-06, + "loss": 0.2117, + "step": 1643 + }, + { + "epoch": 0.64, + "grad_norm": 4.403822031964082, + "learning_rate": 3.0952657954410792e-06, + "loss": 0.3243, + "step": 1644 + }, + { + "epoch": 0.64, + "grad_norm": 5.17237295544702, + "learning_rate": 3.0894764082014805e-06, + "loss": 0.0755, + "step": 1645 + }, + { + "epoch": 0.64, + "grad_norm": 5.143751849182849, + "learning_rate": 3.0836900187230475e-06, + "loss": 0.3995, + "step": 1646 + }, + { + "epoch": 0.64, + "grad_norm": 4.080162008984293, + "learning_rate": 3.0779066360850774e-06, + "loss": 0.4505, + "step": 1647 + }, + { + "epoch": 0.64, + "grad_norm": 4.31575469159342, + "learning_rate": 3.072126269362151e-06, + "loss": 0.3163, + "step": 1648 + }, + { + "epoch": 0.64, + "grad_norm": 4.335292171405565, + "learning_rate": 3.0663489276241166e-06, + "loss": 0.5304, + "step": 1649 + }, + { + "epoch": 0.64, + "grad_norm": 4.486175151787324, + "learning_rate": 3.0605746199360755e-06, + "loss": 0.359, + "step": 1650 + }, + { + "epoch": 0.64, + "grad_norm": 4.27442481961856, + "learning_rate": 3.0548033553583707e-06, + "loss": 0.4114, + "step": 1651 + }, + { + "epoch": 0.64, + "grad_norm": 4.766822274738467, + "learning_rate": 3.049035142946565e-06, + "loss": 0.2047, + "step": 1652 + }, + { + "epoch": 0.64, + "grad_norm": 5.291066566094241, + "learning_rate": 3.0432699917514375e-06, + "loss": 0.1798, + "step": 1653 + }, + { + "epoch": 0.64, + "grad_norm": 6.140959022245619, + "learning_rate": 3.0375079108189613e-06, + "loss": 0.2744, + "step": 1654 + }, + { + "epoch": 0.64, + "grad_norm": 5.219552671471157, + "learning_rate": 3.0317489091902936e-06, + "loss": 0.2772, + "step": 1655 + }, + { + "epoch": 0.64, + "grad_norm": 3.2133288245929497, + "learning_rate": 3.0259929959017585e-06, + "loss": 0.0624, + "step": 1656 + }, + { + "epoch": 0.64, + "grad_norm": 4.3653314556421225, + "learning_rate": 3.0202401799848347e-06, + "loss": 0.49, + "step": 1657 + }, + { + "epoch": 0.64, + "grad_norm": 3.2388325557812734, + "learning_rate": 3.0144904704661413e-06, + "loss": 0.1567, + "step": 1658 + }, + { + "epoch": 0.64, + "grad_norm": 3.707238816625475, + "learning_rate": 3.0087438763674226e-06, + "loss": 0.2937, + "step": 1659 + }, + { + "epoch": 0.64, + "grad_norm": 4.53005153553539, + "learning_rate": 3.003000406705535e-06, + "loss": 0.1577, + "step": 1660 + }, + { + "epoch": 0.64, + "grad_norm": 5.058192745606313, + "learning_rate": 2.9972600704924348e-06, + "loss": 0.3803, + "step": 1661 + }, + { + "epoch": 0.64, + "grad_norm": 4.365488213512427, + "learning_rate": 2.991522876735154e-06, + "loss": 0.323, + "step": 1662 + }, + { + "epoch": 0.64, + "grad_norm": 5.502784046793897, + "learning_rate": 2.985788834435802e-06, + "loss": 0.6248, + "step": 1663 + }, + { + "epoch": 0.64, + "grad_norm": 4.360244546359318, + "learning_rate": 2.9800579525915394e-06, + "loss": 0.5125, + "step": 1664 + }, + { + "epoch": 0.64, + "grad_norm": 5.882626192669339, + "learning_rate": 2.9743302401945707e-06, + "loss": 0.4606, + "step": 1665 + }, + { + "epoch": 0.64, + "grad_norm": 4.952718683598004, + "learning_rate": 2.9686057062321226e-06, + "loss": 0.2694, + "step": 1666 + }, + { + "epoch": 0.64, + "grad_norm": 3.2532355007595117, + "learning_rate": 2.9628843596864386e-06, + "loss": 0.3237, + "step": 1667 + }, + { + "epoch": 0.64, + "grad_norm": 4.525233809899968, + "learning_rate": 2.95716620953476e-06, + "loss": 0.4447, + "step": 1668 + }, + { + "epoch": 0.65, + "grad_norm": 3.30406452613882, + "learning_rate": 2.9514512647493123e-06, + "loss": 0.1828, + "step": 1669 + }, + { + "epoch": 0.65, + "grad_norm": 4.100129586502264, + "learning_rate": 2.9457395342972904e-06, + "loss": 0.2453, + "step": 1670 + }, + { + "epoch": 0.65, + "grad_norm": 4.798501317618989, + "learning_rate": 2.940031027140848e-06, + "loss": 0.2783, + "step": 1671 + }, + { + "epoch": 0.65, + "grad_norm": 4.239549366716743, + "learning_rate": 2.9343257522370784e-06, + "loss": 0.33, + "step": 1672 + }, + { + "epoch": 0.65, + "grad_norm": 3.849134728746657, + "learning_rate": 2.928623718538006e-06, + "loss": 0.1592, + "step": 1673 + }, + { + "epoch": 0.65, + "grad_norm": 4.467559854123593, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.5214, + "step": 1674 + }, + { + "epoch": 0.65, + "grad_norm": 5.066321792108032, + "learning_rate": 2.917229410536604e-06, + "loss": 0.5847, + "step": 1675 + }, + { + "epoch": 0.65, + "grad_norm": 3.7684968096427176, + "learning_rate": 2.9115371541128344e-06, + "loss": 0.3402, + "step": 1676 + }, + { + "epoch": 0.65, + "grad_norm": 6.770360438291747, + "learning_rate": 2.905848174650856e-06, + "loss": 0.1709, + "step": 1677 + }, + { + "epoch": 0.65, + "grad_norm": 3.0763589893400987, + "learning_rate": 2.900162481077126e-06, + "loss": 0.3201, + "step": 1678 + }, + { + "epoch": 0.65, + "grad_norm": 3.6197372876556435, + "learning_rate": 2.8944800823129413e-06, + "loss": 0.2598, + "step": 1679 + }, + { + "epoch": 0.65, + "grad_norm": 6.290460315443914, + "learning_rate": 2.8888009872744332e-06, + "loss": 0.5132, + "step": 1680 + }, + { + "epoch": 0.65, + "grad_norm": 3.8173794768215568, + "learning_rate": 2.883125204872542e-06, + "loss": 0.2549, + "step": 1681 + }, + { + "epoch": 0.65, + "grad_norm": 4.439447427484574, + "learning_rate": 2.8774527440130173e-06, + "loss": 0.2146, + "step": 1682 + }, + { + "epoch": 0.65, + "grad_norm": 5.222232351099214, + "learning_rate": 2.8717836135963937e-06, + "loss": 0.5238, + "step": 1683 + }, + { + "epoch": 0.65, + "grad_norm": 3.582334617398952, + "learning_rate": 2.866117822517982e-06, + "loss": 0.3758, + "step": 1684 + }, + { + "epoch": 0.65, + "grad_norm": 5.959571024791742, + "learning_rate": 2.860455379667851e-06, + "loss": 0.156, + "step": 1685 + }, + { + "epoch": 0.65, + "grad_norm": 4.931363949721377, + "learning_rate": 2.8547962939308187e-06, + "loss": 0.4552, + "step": 1686 + }, + { + "epoch": 0.65, + "grad_norm": 5.3040823343772985, + "learning_rate": 2.8491405741864323e-06, + "loss": 0.3618, + "step": 1687 + }, + { + "epoch": 0.65, + "grad_norm": 3.607659758841999, + "learning_rate": 2.8434882293089607e-06, + "loss": 0.3063, + "step": 1688 + }, + { + "epoch": 0.65, + "grad_norm": 4.702741810993404, + "learning_rate": 2.837839268167373e-06, + "loss": 0.2151, + "step": 1689 + }, + { + "epoch": 0.65, + "grad_norm": 3.211298705112521, + "learning_rate": 2.8321936996253368e-06, + "loss": 0.3437, + "step": 1690 + }, + { + "epoch": 0.65, + "grad_norm": 4.345823636413663, + "learning_rate": 2.8265515325411856e-06, + "loss": 0.4581, + "step": 1691 + }, + { + "epoch": 0.65, + "grad_norm": 3.443663189487989, + "learning_rate": 2.8209127757679246e-06, + "loss": 0.1207, + "step": 1692 + }, + { + "epoch": 0.65, + "grad_norm": 4.209942661911642, + "learning_rate": 2.8152774381532033e-06, + "loss": 0.4597, + "step": 1693 + }, + { + "epoch": 0.65, + "grad_norm": 4.399312147279553, + "learning_rate": 2.8096455285393094e-06, + "loss": 0.2113, + "step": 1694 + }, + { + "epoch": 0.66, + "grad_norm": 4.174749198687062, + "learning_rate": 2.804017055763149e-06, + "loss": 0.4833, + "step": 1695 + }, + { + "epoch": 0.66, + "grad_norm": 4.1444824944970495, + "learning_rate": 2.798392028656237e-06, + "loss": 0.2804, + "step": 1696 + }, + { + "epoch": 0.66, + "grad_norm": 4.044934464655802, + "learning_rate": 2.792770456044683e-06, + "loss": 0.4196, + "step": 1697 + }, + { + "epoch": 0.66, + "grad_norm": 4.341011730415659, + "learning_rate": 2.787152346749173e-06, + "loss": 0.27, + "step": 1698 + }, + { + "epoch": 0.66, + "grad_norm": 3.1951400779268866, + "learning_rate": 2.7815377095849612e-06, + "loss": 0.1069, + "step": 1699 + }, + { + "epoch": 0.66, + "grad_norm": 5.924086035872311, + "learning_rate": 2.775926553361855e-06, + "loss": 0.4745, + "step": 1700 + }, + { + "epoch": 0.66, + "grad_norm": 5.3989896193178994, + "learning_rate": 2.7703188868841936e-06, + "loss": 0.4895, + "step": 1701 + }, + { + "epoch": 0.66, + "grad_norm": 3.9757493138429996, + "learning_rate": 2.7647147189508485e-06, + "loss": 0.1893, + "step": 1702 + }, + { + "epoch": 0.66, + "grad_norm": 4.039018022787459, + "learning_rate": 2.7591140583551956e-06, + "loss": 0.315, + "step": 1703 + }, + { + "epoch": 0.66, + "grad_norm": 3.6680430669769373, + "learning_rate": 2.7535169138851124e-06, + "loss": 0.1205, + "step": 1704 + }, + { + "epoch": 0.66, + "grad_norm": 3.394568294103908, + "learning_rate": 2.7479232943229567e-06, + "loss": 0.1386, + "step": 1705 + }, + { + "epoch": 0.66, + "grad_norm": 3.920795596494274, + "learning_rate": 2.7423332084455543e-06, + "loss": 0.1672, + "step": 1706 + }, + { + "epoch": 0.66, + "grad_norm": 5.109653066753222, + "learning_rate": 2.7367466650241868e-06, + "loss": 0.586, + "step": 1707 + }, + { + "epoch": 0.66, + "grad_norm": 4.636956032652642, + "learning_rate": 2.731163672824583e-06, + "loss": 0.2288, + "step": 1708 + }, + { + "epoch": 0.66, + "grad_norm": 3.808929072946402, + "learning_rate": 2.7255842406068917e-06, + "loss": 0.521, + "step": 1709 + }, + { + "epoch": 0.66, + "grad_norm": 5.075018120655451, + "learning_rate": 2.720008377125682e-06, + "loss": 0.363, + "step": 1710 + }, + { + "epoch": 0.66, + "grad_norm": 4.459381989696827, + "learning_rate": 2.7144360911299158e-06, + "loss": 0.2035, + "step": 1711 + }, + { + "epoch": 0.66, + "grad_norm": 6.3473002345601435, + "learning_rate": 2.708867391362948e-06, + "loss": 0.4227, + "step": 1712 + }, + { + "epoch": 0.66, + "grad_norm": 3.9174475662573927, + "learning_rate": 2.703302286562506e-06, + "loss": 0.1723, + "step": 1713 + }, + { + "epoch": 0.66, + "grad_norm": 3.7138861041297355, + "learning_rate": 2.697740785460675e-06, + "loss": 0.3219, + "step": 1714 + }, + { + "epoch": 0.66, + "grad_norm": 4.143243281149177, + "learning_rate": 2.692182896783885e-06, + "loss": 0.6009, + "step": 1715 + }, + { + "epoch": 0.66, + "grad_norm": 6.554380657981959, + "learning_rate": 2.686628629252899e-06, + "loss": 0.2703, + "step": 1716 + }, + { + "epoch": 0.66, + "grad_norm": 3.018761179383061, + "learning_rate": 2.681077991582797e-06, + "loss": 0.3545, + "step": 1717 + }, + { + "epoch": 0.66, + "grad_norm": 4.542355393780447, + "learning_rate": 2.6755309924829657e-06, + "loss": 0.1874, + "step": 1718 + }, + { + "epoch": 0.66, + "grad_norm": 4.514922915379382, + "learning_rate": 2.6699876406570823e-06, + "loss": 0.1727, + "step": 1719 + }, + { + "epoch": 0.66, + "grad_norm": 3.0740736807638944, + "learning_rate": 2.664447944803097e-06, + "loss": 0.1471, + "step": 1720 + }, + { + "epoch": 0.67, + "grad_norm": 6.20581139373578, + "learning_rate": 2.6589119136132273e-06, + "loss": 0.6312, + "step": 1721 + }, + { + "epoch": 0.67, + "grad_norm": 4.256503390802347, + "learning_rate": 2.6533795557739407e-06, + "loss": 0.4009, + "step": 1722 + }, + { + "epoch": 0.67, + "grad_norm": 4.387740355612719, + "learning_rate": 2.6478508799659393e-06, + "loss": 0.3199, + "step": 1723 + }, + { + "epoch": 0.67, + "grad_norm": 5.968641510398305, + "learning_rate": 2.6423258948641483e-06, + "loss": 0.2967, + "step": 1724 + }, + { + "epoch": 0.67, + "grad_norm": 2.7168503542752256, + "learning_rate": 2.636804609137703e-06, + "loss": 0.322, + "step": 1725 + }, + { + "epoch": 0.67, + "grad_norm": 4.447583109886695, + "learning_rate": 2.6312870314499335e-06, + "loss": 0.3068, + "step": 1726 + }, + { + "epoch": 0.67, + "grad_norm": 5.9977352996698166, + "learning_rate": 2.62577317045835e-06, + "loss": 0.5861, + "step": 1727 + }, + { + "epoch": 0.67, + "grad_norm": 4.047519665534339, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.4104, + "step": 1728 + }, + { + "epoch": 0.67, + "grad_norm": 4.99157274852523, + "learning_rate": 2.614756633164618e-06, + "loss": 0.2417, + "step": 1729 + }, + { + "epoch": 0.67, + "grad_norm": 3.056993343008954, + "learning_rate": 2.609253974148278e-06, + "loss": 0.2285, + "step": 1730 + }, + { + "epoch": 0.67, + "grad_norm": 3.676483893783997, + "learning_rate": 2.603755066399718e-06, + "loss": 0.4048, + "step": 1731 + }, + { + "epoch": 0.67, + "grad_norm": 4.302491960324539, + "learning_rate": 2.5982599185471535e-06, + "loss": 0.4063, + "step": 1732 + }, + { + "epoch": 0.67, + "grad_norm": 5.8631562321598905, + "learning_rate": 2.5927685392129033e-06, + "loss": 0.2333, + "step": 1733 + }, + { + "epoch": 0.67, + "grad_norm": 3.929943829504728, + "learning_rate": 2.5872809370133704e-06, + "loss": 0.2795, + "step": 1734 + }, + { + "epoch": 0.67, + "grad_norm": 3.844718932304175, + "learning_rate": 2.5817971205590343e-06, + "loss": 0.3706, + "step": 1735 + }, + { + "epoch": 0.67, + "grad_norm": 5.804649506267364, + "learning_rate": 2.5763170984544304e-06, + "loss": 0.4786, + "step": 1736 + }, + { + "epoch": 0.67, + "grad_norm": 3.680940953198759, + "learning_rate": 2.5708408792981443e-06, + "loss": 0.2343, + "step": 1737 + }, + { + "epoch": 0.67, + "grad_norm": 4.491004565731436, + "learning_rate": 2.5653684716827904e-06, + "loss": 0.4233, + "step": 1738 + }, + { + "epoch": 0.67, + "grad_norm": 3.71514538609898, + "learning_rate": 2.5598998841950105e-06, + "loss": 0.411, + "step": 1739 + }, + { + "epoch": 0.67, + "grad_norm": 3.6820129989056176, + "learning_rate": 2.5544351254154407e-06, + "loss": 0.1074, + "step": 1740 + }, + { + "epoch": 0.67, + "grad_norm": 3.4278678511955714, + "learning_rate": 2.5489742039187184e-06, + "loss": 0.3178, + "step": 1741 + }, + { + "epoch": 0.67, + "grad_norm": 5.991000421170609, + "learning_rate": 2.5435171282734563e-06, + "loss": 0.4246, + "step": 1742 + }, + { + "epoch": 0.67, + "grad_norm": 4.116980081819, + "learning_rate": 2.5380639070422343e-06, + "loss": 0.1699, + "step": 1743 + }, + { + "epoch": 0.67, + "grad_norm": 4.606339347586722, + "learning_rate": 2.5326145487815822e-06, + "loss": 0.2506, + "step": 1744 + }, + { + "epoch": 0.67, + "grad_norm": 2.978536746939724, + "learning_rate": 2.527169062041972e-06, + "loss": 0.2521, + "step": 1745 + }, + { + "epoch": 0.68, + "grad_norm": 2.542237963771189, + "learning_rate": 2.5217274553677975e-06, + "loss": 0.303, + "step": 1746 + }, + { + "epoch": 0.68, + "grad_norm": 7.783993323145913, + "learning_rate": 2.516289737297366e-06, + "loss": 0.7893, + "step": 1747 + }, + { + "epoch": 0.68, + "grad_norm": 5.1531787428384455, + "learning_rate": 2.510855916362884e-06, + "loss": 0.5545, + "step": 1748 + }, + { + "epoch": 0.68, + "grad_norm": 5.025596900208484, + "learning_rate": 2.5054260010904423e-06, + "loss": 0.4583, + "step": 1749 + }, + { + "epoch": 0.68, + "grad_norm": 5.534124333013831, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.4523, + "step": 1750 + }, + { + "epoch": 0.68, + "grad_norm": 4.347062562066108, + "learning_rate": 2.494577921605382e-06, + "loss": 0.2637, + "step": 1751 + }, + { + "epoch": 0.68, + "grad_norm": 3.5250474399781013, + "learning_rate": 2.489159774414252e-06, + "loss": 0.1547, + "step": 1752 + }, + { + "epoch": 0.68, + "grad_norm": 6.045552337721784, + "learning_rate": 2.4837455669281075e-06, + "loss": 0.3852, + "step": 1753 + }, + { + "epoch": 0.68, + "grad_norm": 5.580015984978388, + "learning_rate": 2.478335307642264e-06, + "loss": 0.1578, + "step": 1754 + }, + { + "epoch": 0.68, + "grad_norm": 4.232880365049796, + "learning_rate": 2.472929005045842e-06, + "loss": 0.3332, + "step": 1755 + }, + { + "epoch": 0.68, + "grad_norm": 5.046078565805272, + "learning_rate": 2.467526667621754e-06, + "loss": 0.461, + "step": 1756 + }, + { + "epoch": 0.68, + "grad_norm": 4.851610044152447, + "learning_rate": 2.4621283038466916e-06, + "loss": 0.4053, + "step": 1757 + }, + { + "epoch": 0.68, + "grad_norm": 4.970665341233688, + "learning_rate": 2.4567339221911086e-06, + "loss": 0.2675, + "step": 1758 + }, + { + "epoch": 0.68, + "grad_norm": 4.396785887035718, + "learning_rate": 2.451343531119215e-06, + "loss": 0.2001, + "step": 1759 + }, + { + "epoch": 0.68, + "grad_norm": 4.539376066279478, + "learning_rate": 2.4459571390889537e-06, + "loss": 0.3424, + "step": 1760 + }, + { + "epoch": 0.68, + "grad_norm": 4.5926404292727785, + "learning_rate": 2.4405747545519966e-06, + "loss": 0.2022, + "step": 1761 + }, + { + "epoch": 0.68, + "grad_norm": 3.980381363671957, + "learning_rate": 2.435196385953727e-06, + "loss": 0.2754, + "step": 1762 + }, + { + "epoch": 0.68, + "grad_norm": 4.563763971441686, + "learning_rate": 2.4298220417332263e-06, + "loss": 0.1919, + "step": 1763 + }, + { + "epoch": 0.68, + "grad_norm": 4.048892028409314, + "learning_rate": 2.424451730323261e-06, + "loss": 0.4567, + "step": 1764 + }, + { + "epoch": 0.68, + "grad_norm": 5.328049914773882, + "learning_rate": 2.4190854601502717e-06, + "loss": 0.1598, + "step": 1765 + }, + { + "epoch": 0.68, + "grad_norm": 4.898882776743654, + "learning_rate": 2.413723239634356e-06, + "loss": 0.4388, + "step": 1766 + }, + { + "epoch": 0.68, + "grad_norm": 4.0984047535774355, + "learning_rate": 2.4083650771892576e-06, + "loss": 0.0878, + "step": 1767 + }, + { + "epoch": 0.68, + "grad_norm": 4.127977143842788, + "learning_rate": 2.4030109812223534e-06, + "loss": 0.3876, + "step": 1768 + }, + { + "epoch": 0.68, + "grad_norm": 3.4286384315349805, + "learning_rate": 2.3976609601346395e-06, + "loss": 0.235, + "step": 1769 + }, + { + "epoch": 0.68, + "grad_norm": 5.219348625459872, + "learning_rate": 2.3923150223207176e-06, + "loss": 0.2831, + "step": 1770 + }, + { + "epoch": 0.68, + "grad_norm": 4.528009026452362, + "learning_rate": 2.386973176168783e-06, + "loss": 0.2909, + "step": 1771 + }, + { + "epoch": 0.69, + "grad_norm": 3.993414814418132, + "learning_rate": 2.381635430060611e-06, + "loss": 0.2571, + "step": 1772 + }, + { + "epoch": 0.69, + "grad_norm": 3.8509877796601737, + "learning_rate": 2.376301792371543e-06, + "loss": 0.2845, + "step": 1773 + }, + { + "epoch": 0.69, + "grad_norm": 4.431228556677509, + "learning_rate": 2.370972271470475e-06, + "loss": 0.2373, + "step": 1774 + }, + { + "epoch": 0.69, + "grad_norm": 4.903649421667131, + "learning_rate": 2.3656468757198414e-06, + "loss": 0.2355, + "step": 1775 + }, + { + "epoch": 0.69, + "grad_norm": 5.842661793218539, + "learning_rate": 2.3603256134756066e-06, + "loss": 0.3632, + "step": 1776 + }, + { + "epoch": 0.69, + "grad_norm": 3.8990150290189516, + "learning_rate": 2.3550084930872475e-06, + "loss": 0.1405, + "step": 1777 + }, + { + "epoch": 0.69, + "grad_norm": 5.273142419554314, + "learning_rate": 2.3496955228977437e-06, + "loss": 0.3316, + "step": 1778 + }, + { + "epoch": 0.69, + "grad_norm": 3.903402169816889, + "learning_rate": 2.3443867112435585e-06, + "loss": 0.1332, + "step": 1779 + }, + { + "epoch": 0.69, + "grad_norm": 5.167027410847691, + "learning_rate": 2.3390820664546353e-06, + "loss": 0.3948, + "step": 1780 + }, + { + "epoch": 0.69, + "grad_norm": 3.9550706014204544, + "learning_rate": 2.333781596854377e-06, + "loss": 0.1822, + "step": 1781 + }, + { + "epoch": 0.69, + "grad_norm": 3.897982596771436, + "learning_rate": 2.328485310759635e-06, + "loss": 0.4817, + "step": 1782 + }, + { + "epoch": 0.69, + "grad_norm": 3.641644997255952, + "learning_rate": 2.323193216480698e-06, + "loss": 0.1201, + "step": 1783 + }, + { + "epoch": 0.69, + "grad_norm": 5.785292260897342, + "learning_rate": 2.317905322321276e-06, + "loss": 0.3171, + "step": 1784 + }, + { + "epoch": 0.69, + "grad_norm": 5.1639291376943355, + "learning_rate": 2.3126216365784894e-06, + "loss": 0.3944, + "step": 1785 + }, + { + "epoch": 0.69, + "grad_norm": 6.233736261489142, + "learning_rate": 2.307342167542854e-06, + "loss": 0.517, + "step": 1786 + }, + { + "epoch": 0.69, + "grad_norm": 2.8494868708678367, + "learning_rate": 2.3020669234982713e-06, + "loss": 0.0983, + "step": 1787 + }, + { + "epoch": 0.69, + "grad_norm": 2.9212668064989864, + "learning_rate": 2.296795912722014e-06, + "loss": 0.2703, + "step": 1788 + }, + { + "epoch": 0.69, + "grad_norm": 4.893351719853449, + "learning_rate": 2.291529143484707e-06, + "loss": 0.2016, + "step": 1789 + }, + { + "epoch": 0.69, + "grad_norm": 5.775310475610977, + "learning_rate": 2.286266624050326e-06, + "loss": 0.2352, + "step": 1790 + }, + { + "epoch": 0.69, + "grad_norm": 5.861045659934974, + "learning_rate": 2.2810083626761757e-06, + "loss": 0.2019, + "step": 1791 + }, + { + "epoch": 0.69, + "grad_norm": 5.620298672464006, + "learning_rate": 2.275754367612881e-06, + "loss": 0.5436, + "step": 1792 + }, + { + "epoch": 0.69, + "grad_norm": 4.001508931792466, + "learning_rate": 2.2705046471043703e-06, + "loss": 0.3174, + "step": 1793 + }, + { + "epoch": 0.69, + "grad_norm": 5.592378740037039, + "learning_rate": 2.265259209387867e-06, + "loss": 0.3409, + "step": 1794 + }, + { + "epoch": 0.69, + "grad_norm": 5.571920468246785, + "learning_rate": 2.2600180626938738e-06, + "loss": 0.3798, + "step": 1795 + }, + { + "epoch": 0.69, + "grad_norm": 6.489737273497598, + "learning_rate": 2.25478121524616e-06, + "loss": 0.3132, + "step": 1796 + }, + { + "epoch": 0.69, + "grad_norm": 3.2552839858457787, + "learning_rate": 2.249548675261751e-06, + "loss": 0.1245, + "step": 1797 + }, + { + "epoch": 0.7, + "grad_norm": 5.697271869044669, + "learning_rate": 2.2443204509509094e-06, + "loss": 0.4708, + "step": 1798 + }, + { + "epoch": 0.7, + "grad_norm": 4.0946877009284846, + "learning_rate": 2.2390965505171304e-06, + "loss": 0.1455, + "step": 1799 + }, + { + "epoch": 0.7, + "grad_norm": 4.344739133195355, + "learning_rate": 2.2338769821571225e-06, + "loss": 0.17, + "step": 1800 + }, + { + "epoch": 0.7, + "grad_norm": 4.536375158972799, + "learning_rate": 2.2286617540607974e-06, + "loss": 0.2821, + "step": 1801 + }, + { + "epoch": 0.7, + "grad_norm": 3.6738225162461355, + "learning_rate": 2.2234508744112564e-06, + "loss": 0.2426, + "step": 1802 + }, + { + "epoch": 0.7, + "grad_norm": 5.560354059448164, + "learning_rate": 2.2182443513847777e-06, + "loss": 0.4775, + "step": 1803 + }, + { + "epoch": 0.7, + "grad_norm": 3.2596417176638885, + "learning_rate": 2.213042193150804e-06, + "loss": 0.2052, + "step": 1804 + }, + { + "epoch": 0.7, + "grad_norm": 3.9607589865283486, + "learning_rate": 2.207844407871929e-06, + "loss": 0.37, + "step": 1805 + }, + { + "epoch": 0.7, + "grad_norm": 3.8086407477970563, + "learning_rate": 2.202651003703885e-06, + "loss": 0.4081, + "step": 1806 + }, + { + "epoch": 0.7, + "grad_norm": 4.034000667010573, + "learning_rate": 2.1974619887955294e-06, + "loss": 0.2258, + "step": 1807 + }, + { + "epoch": 0.7, + "grad_norm": 3.4654356360972334, + "learning_rate": 2.1922773712888356e-06, + "loss": 0.5445, + "step": 1808 + }, + { + "epoch": 0.7, + "grad_norm": 5.100062974354942, + "learning_rate": 2.1870971593188704e-06, + "loss": 0.2674, + "step": 1809 + }, + { + "epoch": 0.7, + "grad_norm": 4.531630835810501, + "learning_rate": 2.181921361013794e-06, + "loss": 0.1823, + "step": 1810 + }, + { + "epoch": 0.7, + "grad_norm": 3.4791204312655166, + "learning_rate": 2.17674998449484e-06, + "loss": 0.4051, + "step": 1811 + }, + { + "epoch": 0.7, + "grad_norm": 3.6143879526477787, + "learning_rate": 2.1715830378763025e-06, + "loss": 0.276, + "step": 1812 + }, + { + "epoch": 0.7, + "grad_norm": 3.9283119165737888, + "learning_rate": 2.1664205292655253e-06, + "loss": 0.1797, + "step": 1813 + }, + { + "epoch": 0.7, + "grad_norm": 3.558243741990926, + "learning_rate": 2.16126246676289e-06, + "loss": 0.397, + "step": 1814 + }, + { + "epoch": 0.7, + "grad_norm": 5.279406464430176, + "learning_rate": 2.1561088584617995e-06, + "loss": 0.4484, + "step": 1815 + }, + { + "epoch": 0.7, + "grad_norm": 4.822172198039966, + "learning_rate": 2.1509597124486693e-06, + "loss": 0.5161, + "step": 1816 + }, + { + "epoch": 0.7, + "grad_norm": 3.8481469598497378, + "learning_rate": 2.1458150368029146e-06, + "loss": 0.4062, + "step": 1817 + }, + { + "epoch": 0.7, + "grad_norm": 5.171123093227522, + "learning_rate": 2.140674839596931e-06, + "loss": 0.4381, + "step": 1818 + }, + { + "epoch": 0.7, + "grad_norm": 3.7516300336535124, + "learning_rate": 2.135539128896092e-06, + "loss": 0.228, + "step": 1819 + }, + { + "epoch": 0.7, + "grad_norm": 5.1320863200582485, + "learning_rate": 2.13040791275873e-06, + "loss": 0.4194, + "step": 1820 + }, + { + "epoch": 0.7, + "grad_norm": 5.013216837242059, + "learning_rate": 2.1252811992361256e-06, + "loss": 0.2301, + "step": 1821 + }, + { + "epoch": 0.7, + "grad_norm": 3.4767324138964217, + "learning_rate": 2.1201589963724933e-06, + "loss": 0.2864, + "step": 1822 + }, + { + "epoch": 0.7, + "grad_norm": 4.9590311234581295, + "learning_rate": 2.1150413122049714e-06, + "loss": 0.6389, + "step": 1823 + }, + { + "epoch": 0.71, + "grad_norm": 4.719787599208209, + "learning_rate": 2.109928154763606e-06, + "loss": 0.5455, + "step": 1824 + }, + { + "epoch": 0.71, + "grad_norm": 5.849412917078412, + "learning_rate": 2.104819532071343e-06, + "loss": 0.4647, + "step": 1825 + }, + { + "epoch": 0.71, + "grad_norm": 5.977819357950538, + "learning_rate": 2.09971545214401e-06, + "loss": 0.4172, + "step": 1826 + }, + { + "epoch": 0.71, + "grad_norm": 3.9708211287255444, + "learning_rate": 2.094615922990309e-06, + "loss": 0.3091, + "step": 1827 + }, + { + "epoch": 0.71, + "grad_norm": 4.347703838314115, + "learning_rate": 2.0895209526118016e-06, + "loss": 0.3192, + "step": 1828 + }, + { + "epoch": 0.71, + "grad_norm": 3.407786720546166, + "learning_rate": 2.084430549002894e-06, + "loss": 0.0593, + "step": 1829 + }, + { + "epoch": 0.71, + "grad_norm": 5.352516959592958, + "learning_rate": 2.0793447201508288e-06, + "loss": 0.2893, + "step": 1830 + }, + { + "epoch": 0.71, + "grad_norm": 3.9752958408104626, + "learning_rate": 2.0742634740356695e-06, + "loss": 0.3441, + "step": 1831 + }, + { + "epoch": 0.71, + "grad_norm": 3.844520877279769, + "learning_rate": 2.0691868186302898e-06, + "loss": 0.1656, + "step": 1832 + }, + { + "epoch": 0.71, + "grad_norm": 5.464660124764538, + "learning_rate": 2.064114761900359e-06, + "loss": 0.2321, + "step": 1833 + }, + { + "epoch": 0.71, + "grad_norm": 3.5319744273333855, + "learning_rate": 2.0590473118043326e-06, + "loss": 0.1348, + "step": 1834 + }, + { + "epoch": 0.71, + "grad_norm": 5.898414095415711, + "learning_rate": 2.0539844762934353e-06, + "loss": 0.4963, + "step": 1835 + }, + { + "epoch": 0.71, + "grad_norm": 5.087696125078068, + "learning_rate": 2.0489262633116536e-06, + "loss": 0.3278, + "step": 1836 + }, + { + "epoch": 0.71, + "grad_norm": 3.9744662973862352, + "learning_rate": 2.043872680795721e-06, + "loss": 0.2132, + "step": 1837 + }, + { + "epoch": 0.71, + "grad_norm": 2.2202779147594316, + "learning_rate": 2.0388237366751005e-06, + "loss": 0.0496, + "step": 1838 + }, + { + "epoch": 0.71, + "grad_norm": 2.6156922261128424, + "learning_rate": 2.0337794388719845e-06, + "loss": 0.1862, + "step": 1839 + }, + { + "epoch": 0.71, + "grad_norm": 4.314353663515789, + "learning_rate": 2.0287397953012686e-06, + "loss": 0.2203, + "step": 1840 + }, + { + "epoch": 0.71, + "grad_norm": 5.980969740133097, + "learning_rate": 2.023704813870551e-06, + "loss": 0.2546, + "step": 1841 + }, + { + "epoch": 0.71, + "grad_norm": 5.3102029918766345, + "learning_rate": 2.01867450248011e-06, + "loss": 0.1992, + "step": 1842 + }, + { + "epoch": 0.71, + "grad_norm": 6.546211276625146, + "learning_rate": 2.0136488690228993e-06, + "loss": 0.2234, + "step": 1843 + }, + { + "epoch": 0.71, + "grad_norm": 7.448963702862516, + "learning_rate": 2.008627921384531e-06, + "loss": 0.328, + "step": 1844 + }, + { + "epoch": 0.71, + "grad_norm": 4.607520018578633, + "learning_rate": 2.0036116674432653e-06, + "loss": 0.4105, + "step": 1845 + }, + { + "epoch": 0.71, + "grad_norm": 5.146275622462938, + "learning_rate": 1.998600115069998e-06, + "loss": 0.3217, + "step": 1846 + }, + { + "epoch": 0.71, + "grad_norm": 5.174020516804923, + "learning_rate": 1.993593272128248e-06, + "loss": 0.5677, + "step": 1847 + }, + { + "epoch": 0.71, + "grad_norm": 6.154539002930115, + "learning_rate": 1.9885911464741413e-06, + "loss": 0.3293, + "step": 1848 + }, + { + "epoch": 0.71, + "grad_norm": 4.6708980658065995, + "learning_rate": 1.9835937459564065e-06, + "loss": 0.1627, + "step": 1849 + }, + { + "epoch": 0.72, + "grad_norm": 3.2209070064120717, + "learning_rate": 1.978601078416357e-06, + "loss": 0.2464, + "step": 1850 + }, + { + "epoch": 0.72, + "grad_norm": 4.051538378905791, + "learning_rate": 1.9736131516878776e-06, + "loss": 0.391, + "step": 1851 + }, + { + "epoch": 0.72, + "grad_norm": 6.907065551145547, + "learning_rate": 1.9686299735974177e-06, + "loss": 0.4384, + "step": 1852 + }, + { + "epoch": 0.72, + "grad_norm": 4.043205777487744, + "learning_rate": 1.9636515519639735e-06, + "loss": 0.2174, + "step": 1853 + }, + { + "epoch": 0.72, + "grad_norm": 6.585680254109682, + "learning_rate": 1.9586778945990785e-06, + "loss": 0.46, + "step": 1854 + }, + { + "epoch": 0.72, + "grad_norm": 3.1220159311342015, + "learning_rate": 1.9537090093067907e-06, + "loss": 0.1915, + "step": 1855 + }, + { + "epoch": 0.72, + "grad_norm": 2.7753553128253463, + "learning_rate": 1.9487449038836803e-06, + "loss": 0.2192, + "step": 1856 + }, + { + "epoch": 0.72, + "grad_norm": 4.706950482884989, + "learning_rate": 1.9437855861188175e-06, + "loss": 0.2192, + "step": 1857 + }, + { + "epoch": 0.72, + "grad_norm": 3.811551914070741, + "learning_rate": 1.9388310637937606e-06, + "loss": 0.3117, + "step": 1858 + }, + { + "epoch": 0.72, + "grad_norm": 4.783095422388319, + "learning_rate": 1.933881344682543e-06, + "loss": 0.4881, + "step": 1859 + }, + { + "epoch": 0.72, + "grad_norm": 4.461477591229335, + "learning_rate": 1.928936436551661e-06, + "loss": 0.3832, + "step": 1860 + }, + { + "epoch": 0.72, + "grad_norm": 5.096882534563683, + "learning_rate": 1.9239963471600635e-06, + "loss": 0.1809, + "step": 1861 + }, + { + "epoch": 0.72, + "grad_norm": 4.03141917029654, + "learning_rate": 1.9190610842591386e-06, + "loss": 0.3769, + "step": 1862 + }, + { + "epoch": 0.72, + "grad_norm": 4.992746423129923, + "learning_rate": 1.9141306555926985e-06, + "loss": 0.1452, + "step": 1863 + }, + { + "epoch": 0.72, + "grad_norm": 2.509861142515103, + "learning_rate": 1.9092050688969736e-06, + "loss": 0.2118, + "step": 1864 + }, + { + "epoch": 0.72, + "grad_norm": 3.841888948721654, + "learning_rate": 1.9042843319005944e-06, + "loss": 0.2139, + "step": 1865 + }, + { + "epoch": 0.72, + "grad_norm": 4.2507697273324085, + "learning_rate": 1.8993684523245842e-06, + "loss": 0.4808, + "step": 1866 + }, + { + "epoch": 0.72, + "grad_norm": 4.091326455063292, + "learning_rate": 1.8944574378823406e-06, + "loss": 0.3611, + "step": 1867 + }, + { + "epoch": 0.72, + "grad_norm": 4.193311837378678, + "learning_rate": 1.8895512962796304e-06, + "loss": 0.2952, + "step": 1868 + }, + { + "epoch": 0.72, + "grad_norm": 4.128520130462512, + "learning_rate": 1.8846500352145753e-06, + "loss": 0.2216, + "step": 1869 + }, + { + "epoch": 0.72, + "grad_norm": 3.402403936519249, + "learning_rate": 1.879753662377637e-06, + "loss": 0.2593, + "step": 1870 + }, + { + "epoch": 0.72, + "grad_norm": 4.16036607149034, + "learning_rate": 1.874862185451608e-06, + "loss": 0.5864, + "step": 1871 + }, + { + "epoch": 0.72, + "grad_norm": 4.703495245666888, + "learning_rate": 1.8699756121115997e-06, + "loss": 0.1701, + "step": 1872 + }, + { + "epoch": 0.72, + "grad_norm": 6.227222988390471, + "learning_rate": 1.865093950025027e-06, + "loss": 0.4472, + "step": 1873 + }, + { + "epoch": 0.72, + "grad_norm": 3.6940826204907418, + "learning_rate": 1.8602172068516011e-06, + "loss": 0.1333, + "step": 1874 + }, + { + "epoch": 0.72, + "grad_norm": 4.087595824923308, + "learning_rate": 1.8553453902433144e-06, + "loss": 0.3781, + "step": 1875 + }, + { + "epoch": 0.73, + "grad_norm": 4.1949383247133545, + "learning_rate": 1.8504785078444293e-06, + "loss": 0.2486, + "step": 1876 + }, + { + "epoch": 0.73, + "grad_norm": 4.214626070351042, + "learning_rate": 1.8456165672914628e-06, + "loss": 0.4535, + "step": 1877 + }, + { + "epoch": 0.73, + "grad_norm": 2.335984815989052, + "learning_rate": 1.8407595762131814e-06, + "loss": 0.0587, + "step": 1878 + }, + { + "epoch": 0.73, + "grad_norm": 6.097901640087357, + "learning_rate": 1.8359075422305856e-06, + "loss": 0.2541, + "step": 1879 + }, + { + "epoch": 0.73, + "grad_norm": 3.4985719180072055, + "learning_rate": 1.8310604729568964e-06, + "loss": 0.4075, + "step": 1880 + }, + { + "epoch": 0.73, + "grad_norm": 6.265585961033121, + "learning_rate": 1.826218375997545e-06, + "loss": 0.5293, + "step": 1881 + }, + { + "epoch": 0.73, + "grad_norm": 6.017867853833034, + "learning_rate": 1.8213812589501611e-06, + "loss": 0.2587, + "step": 1882 + }, + { + "epoch": 0.73, + "grad_norm": 6.181760996941523, + "learning_rate": 1.8165491294045596e-06, + "loss": 0.1503, + "step": 1883 + }, + { + "epoch": 0.73, + "grad_norm": 4.4153449028242395, + "learning_rate": 1.811721994942731e-06, + "loss": 0.3288, + "step": 1884 + }, + { + "epoch": 0.73, + "grad_norm": 4.861232817903692, + "learning_rate": 1.8068998631388268e-06, + "loss": 0.5081, + "step": 1885 + }, + { + "epoch": 0.73, + "grad_norm": 5.140432536085061, + "learning_rate": 1.8020827415591496e-06, + "loss": 0.1753, + "step": 1886 + }, + { + "epoch": 0.73, + "grad_norm": 4.403257328693415, + "learning_rate": 1.7972706377621412e-06, + "loss": 0.2535, + "step": 1887 + }, + { + "epoch": 0.73, + "grad_norm": 4.702488954204706, + "learning_rate": 1.7924635592983687e-06, + "loss": 0.284, + "step": 1888 + }, + { + "epoch": 0.73, + "grad_norm": 3.8967907167704707, + "learning_rate": 1.7876615137105142e-06, + "loss": 0.3759, + "step": 1889 + }, + { + "epoch": 0.73, + "grad_norm": 3.9964616530740846, + "learning_rate": 1.7828645085333645e-06, + "loss": 0.1542, + "step": 1890 + }, + { + "epoch": 0.73, + "grad_norm": 4.137944778948514, + "learning_rate": 1.7780725512937964e-06, + "loss": 0.3305, + "step": 1891 + }, + { + "epoch": 0.73, + "grad_norm": 4.792807836796973, + "learning_rate": 1.7732856495107648e-06, + "loss": 0.3998, + "step": 1892 + }, + { + "epoch": 0.73, + "grad_norm": 5.4646948533354776, + "learning_rate": 1.7685038106952952e-06, + "loss": 0.4072, + "step": 1893 + }, + { + "epoch": 0.73, + "grad_norm": 3.590201368147436, + "learning_rate": 1.7637270423504664e-06, + "loss": 0.1166, + "step": 1894 + }, + { + "epoch": 0.73, + "grad_norm": 4.864879188400069, + "learning_rate": 1.7589553519714019e-06, + "loss": 0.3662, + "step": 1895 + }, + { + "epoch": 0.73, + "grad_norm": 4.0136210039162625, + "learning_rate": 1.7541887470452606e-06, + "loss": 0.194, + "step": 1896 + }, + { + "epoch": 0.73, + "grad_norm": 4.837556349737367, + "learning_rate": 1.7494272350512137e-06, + "loss": 0.417, + "step": 1897 + }, + { + "epoch": 0.73, + "grad_norm": 4.873319109693996, + "learning_rate": 1.7446708234604498e-06, + "loss": 0.395, + "step": 1898 + }, + { + "epoch": 0.73, + "grad_norm": 5.255409082820054, + "learning_rate": 1.7399195197361507e-06, + "loss": 0.3846, + "step": 1899 + }, + { + "epoch": 0.73, + "grad_norm": 7.184679040282454, + "learning_rate": 1.735173331333484e-06, + "loss": 0.206, + "step": 1900 + }, + { + "epoch": 0.73, + "grad_norm": 6.217040552255799, + "learning_rate": 1.7304322656995908e-06, + "loss": 0.4931, + "step": 1901 + }, + { + "epoch": 0.74, + "grad_norm": 3.9509138521280507, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.131, + "step": 1902 + }, + { + "epoch": 0.74, + "grad_norm": 3.4422246860188808, + "learning_rate": 1.7209655324864898e-06, + "loss": 0.2605, + "step": 1903 + }, + { + "epoch": 0.74, + "grad_norm": 4.732536136113016, + "learning_rate": 1.7162398797613284e-06, + "loss": 0.2366, + "step": 1904 + }, + { + "epoch": 0.74, + "grad_norm": 4.398890019070956, + "learning_rate": 1.7115193795130086e-06, + "loss": 0.1436, + "step": 1905 + }, + { + "epoch": 0.74, + "grad_norm": 4.896327349849701, + "learning_rate": 1.7068040391483676e-06, + "loss": 0.2583, + "step": 1906 + }, + { + "epoch": 0.74, + "grad_norm": 2.892523718757571, + "learning_rate": 1.7020938660661407e-06, + "loss": 0.2705, + "step": 1907 + }, + { + "epoch": 0.74, + "grad_norm": 3.584594296514275, + "learning_rate": 1.6973888676569594e-06, + "loss": 0.4623, + "step": 1908 + }, + { + "epoch": 0.74, + "grad_norm": 3.3928091988912925, + "learning_rate": 1.692689051303335e-06, + "loss": 0.3382, + "step": 1909 + }, + { + "epoch": 0.74, + "grad_norm": 4.985037371450277, + "learning_rate": 1.6879944243796477e-06, + "loss": 0.4469, + "step": 1910 + }, + { + "epoch": 0.74, + "grad_norm": 3.995767937726994, + "learning_rate": 1.6833049942521346e-06, + "loss": 0.1979, + "step": 1911 + }, + { + "epoch": 0.74, + "grad_norm": 3.344882901204299, + "learning_rate": 1.678620768278879e-06, + "loss": 0.3542, + "step": 1912 + }, + { + "epoch": 0.74, + "grad_norm": 5.315501959638579, + "learning_rate": 1.6739417538097986e-06, + "loss": 0.2108, + "step": 1913 + }, + { + "epoch": 0.74, + "grad_norm": 5.2631242630187645, + "learning_rate": 1.6692679581866334e-06, + "loss": 0.4056, + "step": 1914 + }, + { + "epoch": 0.74, + "grad_norm": 4.067063018617635, + "learning_rate": 1.6645993887429345e-06, + "loss": 0.1566, + "step": 1915 + }, + { + "epoch": 0.74, + "grad_norm": 3.8586377855324243, + "learning_rate": 1.6599360528040538e-06, + "loss": 0.3336, + "step": 1916 + }, + { + "epoch": 0.74, + "grad_norm": 3.72609549725891, + "learning_rate": 1.6552779576871297e-06, + "loss": 0.1219, + "step": 1917 + }, + { + "epoch": 0.74, + "grad_norm": 5.7225871897213025, + "learning_rate": 1.650625110701079e-06, + "loss": 0.3494, + "step": 1918 + }, + { + "epoch": 0.74, + "grad_norm": 5.0000787349216385, + "learning_rate": 1.6459775191465827e-06, + "loss": 0.3741, + "step": 1919 + }, + { + "epoch": 0.74, + "grad_norm": 4.2515474732936465, + "learning_rate": 1.6413351903160763e-06, + "loss": 0.2318, + "step": 1920 + }, + { + "epoch": 0.74, + "grad_norm": 3.9321237728452942, + "learning_rate": 1.6366981314937374e-06, + "loss": 0.1037, + "step": 1921 + }, + { + "epoch": 0.74, + "grad_norm": 4.494625429012883, + "learning_rate": 1.632066349955474e-06, + "loss": 0.463, + "step": 1922 + }, + { + "epoch": 0.74, + "grad_norm": 4.93056708515331, + "learning_rate": 1.6274398529689144e-06, + "loss": 0.532, + "step": 1923 + }, + { + "epoch": 0.74, + "grad_norm": 4.47652438721708, + "learning_rate": 1.6228186477933956e-06, + "loss": 0.2872, + "step": 1924 + }, + { + "epoch": 0.74, + "grad_norm": 5.057854019071931, + "learning_rate": 1.6182027416799505e-06, + "loss": 0.5433, + "step": 1925 + }, + { + "epoch": 0.74, + "grad_norm": 4.220721751604938, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.2726, + "step": 1926 + }, + { + "epoch": 0.74, + "grad_norm": 3.8912346045021518, + "learning_rate": 1.608986855601824e-06, + "loss": 0.238, + "step": 1927 + }, + { + "epoch": 0.75, + "grad_norm": 3.904987966064022, + "learning_rate": 1.60438689009759e-06, + "loss": 0.1876, + "step": 1928 + }, + { + "epoch": 0.75, + "grad_norm": 3.7473960548713343, + "learning_rate": 1.5997922525763015e-06, + "loss": 0.3482, + "step": 1929 + }, + { + "epoch": 0.75, + "grad_norm": 3.8209032042140376, + "learning_rate": 1.5952029502473032e-06, + "loss": 0.13, + "step": 1930 + }, + { + "epoch": 0.75, + "grad_norm": 4.00413022039406, + "learning_rate": 1.5906189903115704e-06, + "loss": 0.1374, + "step": 1931 + }, + { + "epoch": 0.75, + "grad_norm": 4.390396954765262, + "learning_rate": 1.5860403799616951e-06, + "loss": 0.2525, + "step": 1932 + }, + { + "epoch": 0.75, + "grad_norm": 5.68223887860946, + "learning_rate": 1.581467126381876e-06, + "loss": 0.5401, + "step": 1933 + }, + { + "epoch": 0.75, + "grad_norm": 3.9328192151159516, + "learning_rate": 1.5768992367479058e-06, + "loss": 0.3122, + "step": 1934 + }, + { + "epoch": 0.75, + "grad_norm": 4.148297088018799, + "learning_rate": 1.5723367182271632e-06, + "loss": 0.1883, + "step": 1935 + }, + { + "epoch": 0.75, + "grad_norm": 2.922163003108522, + "learning_rate": 1.5677795779785932e-06, + "loss": 0.0705, + "step": 1936 + }, + { + "epoch": 0.75, + "grad_norm": 4.108390157771879, + "learning_rate": 1.5632278231527081e-06, + "loss": 0.2291, + "step": 1937 + }, + { + "epoch": 0.75, + "grad_norm": 5.706029599468409, + "learning_rate": 1.5586814608915673e-06, + "loss": 0.3241, + "step": 1938 + }, + { + "epoch": 0.75, + "grad_norm": 4.8920567592092015, + "learning_rate": 1.55414049832877e-06, + "loss": 0.1642, + "step": 1939 + }, + { + "epoch": 0.75, + "grad_norm": 10.07836920347546, + "learning_rate": 1.549604942589441e-06, + "loss": 0.4499, + "step": 1940 + }, + { + "epoch": 0.75, + "grad_norm": 4.099638437227029, + "learning_rate": 1.5450748007902234e-06, + "loss": 0.4009, + "step": 1941 + }, + { + "epoch": 0.75, + "grad_norm": 4.7431169637360755, + "learning_rate": 1.5405500800392643e-06, + "loss": 0.209, + "step": 1942 + }, + { + "epoch": 0.75, + "grad_norm": 4.916367575937835, + "learning_rate": 1.5360307874362052e-06, + "loss": 0.5596, + "step": 1943 + }, + { + "epoch": 0.75, + "grad_norm": 4.878293599526504, + "learning_rate": 1.5315169300721694e-06, + "loss": 0.5261, + "step": 1944 + }, + { + "epoch": 0.75, + "grad_norm": 5.632173888229477, + "learning_rate": 1.5270085150297538e-06, + "loss": 0.2211, + "step": 1945 + }, + { + "epoch": 0.75, + "grad_norm": 6.180609088555902, + "learning_rate": 1.5225055493830132e-06, + "loss": 0.6547, + "step": 1946 + }, + { + "epoch": 0.75, + "grad_norm": 3.0562258716530493, + "learning_rate": 1.518008040197455e-06, + "loss": 0.3523, + "step": 1947 + }, + { + "epoch": 0.75, + "grad_norm": 3.30719723193215, + "learning_rate": 1.5135159945300232e-06, + "loss": 0.4574, + "step": 1948 + }, + { + "epoch": 0.75, + "grad_norm": 6.865967079964248, + "learning_rate": 1.5090294194290882e-06, + "loss": 0.1708, + "step": 1949 + }, + { + "epoch": 0.75, + "grad_norm": 4.1514956790858735, + "learning_rate": 1.5045483219344387e-06, + "loss": 0.4878, + "step": 1950 + }, + { + "epoch": 0.75, + "grad_norm": 3.948402002091091, + "learning_rate": 1.500072709077267e-06, + "loss": 0.1444, + "step": 1951 + }, + { + "epoch": 0.75, + "grad_norm": 4.764960830166332, + "learning_rate": 1.4956025878801611e-06, + "loss": 0.1403, + "step": 1952 + }, + { + "epoch": 0.76, + "grad_norm": 6.010936729544463, + "learning_rate": 1.4911379653570913e-06, + "loss": 0.4949, + "step": 1953 + }, + { + "epoch": 0.76, + "grad_norm": 5.301235148641327, + "learning_rate": 1.4866788485133988e-06, + "loss": 0.3474, + "step": 1954 + }, + { + "epoch": 0.76, + "grad_norm": 3.3562354621637662, + "learning_rate": 1.4822252443457896e-06, + "loss": 0.1339, + "step": 1955 + }, + { + "epoch": 0.76, + "grad_norm": 5.760096093544358, + "learning_rate": 1.4777771598423147e-06, + "loss": 0.427, + "step": 1956 + }, + { + "epoch": 0.76, + "grad_norm": 3.1391426484601626, + "learning_rate": 1.473334601982368e-06, + "loss": 0.3402, + "step": 1957 + }, + { + "epoch": 0.76, + "grad_norm": 5.193663684513153, + "learning_rate": 1.4688975777366716e-06, + "loss": 0.6715, + "step": 1958 + }, + { + "epoch": 0.76, + "grad_norm": 5.611091736111432, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.4865, + "step": 1959 + }, + { + "epoch": 0.76, + "grad_norm": 3.7290852229072966, + "learning_rate": 1.4600401579274876e-06, + "loss": 0.1898, + "step": 1960 + }, + { + "epoch": 0.76, + "grad_norm": 4.830029871514128, + "learning_rate": 1.455619776261986e-06, + "loss": 0.2779, + "step": 1961 + }, + { + "epoch": 0.76, + "grad_norm": 4.243605923130625, + "learning_rate": 1.4512049560066837e-06, + "loss": 0.5229, + "step": 1962 + }, + { + "epoch": 0.76, + "grad_norm": 3.871388634951142, + "learning_rate": 1.4467957040887792e-06, + "loss": 0.4501, + "step": 1963 + }, + { + "epoch": 0.76, + "grad_norm": 4.904867962592391, + "learning_rate": 1.442392027426735e-06, + "loss": 0.2435, + "step": 1964 + }, + { + "epoch": 0.76, + "grad_norm": 4.856088282844029, + "learning_rate": 1.4379939329302629e-06, + "loss": 0.212, + "step": 1965 + }, + { + "epoch": 0.76, + "grad_norm": 7.084729437993213, + "learning_rate": 1.433601427500318e-06, + "loss": 0.4676, + "step": 1966 + }, + { + "epoch": 0.76, + "grad_norm": 4.7455140531670175, + "learning_rate": 1.4292145180290856e-06, + "loss": 0.6632, + "step": 1967 + }, + { + "epoch": 0.76, + "grad_norm": 5.378019868076491, + "learning_rate": 1.4248332113999708e-06, + "loss": 0.5038, + "step": 1968 + }, + { + "epoch": 0.76, + "grad_norm": 5.647863656216499, + "learning_rate": 1.4204575144875871e-06, + "loss": 0.2726, + "step": 1969 + }, + { + "epoch": 0.76, + "grad_norm": 4.431152332928633, + "learning_rate": 1.4160874341577447e-06, + "loss": 0.2333, + "step": 1970 + }, + { + "epoch": 0.76, + "grad_norm": 4.495704443756592, + "learning_rate": 1.4117229772674424e-06, + "loss": 0.4257, + "step": 1971 + }, + { + "epoch": 0.76, + "grad_norm": 3.691984119348884, + "learning_rate": 1.4073641506648555e-06, + "loss": 0.1647, + "step": 1972 + }, + { + "epoch": 0.76, + "grad_norm": 5.037385327982083, + "learning_rate": 1.4030109611893233e-06, + "loss": 0.418, + "step": 1973 + }, + { + "epoch": 0.76, + "grad_norm": 4.718651261892519, + "learning_rate": 1.3986634156713418e-06, + "loss": 0.4115, + "step": 1974 + }, + { + "epoch": 0.76, + "grad_norm": 3.653336887853578, + "learning_rate": 1.3943215209325505e-06, + "loss": 0.1272, + "step": 1975 + }, + { + "epoch": 0.76, + "grad_norm": 4.427301242675204, + "learning_rate": 1.3899852837857219e-06, + "loss": 0.3621, + "step": 1976 + }, + { + "epoch": 0.76, + "grad_norm": 5.912708632765017, + "learning_rate": 1.3856547110347523e-06, + "loss": 0.3707, + "step": 1977 + }, + { + "epoch": 0.76, + "grad_norm": 5.75903615667389, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.2424, + "step": 1978 + }, + { + "epoch": 0.77, + "grad_norm": 3.9543030958828203, + "learning_rate": 1.3770105858915217e-06, + "loss": 0.1327, + "step": 1979 + }, + { + "epoch": 0.77, + "grad_norm": 5.073839072996145, + "learning_rate": 1.3726970470625705e-06, + "loss": 0.2441, + "step": 1980 + }, + { + "epoch": 0.77, + "grad_norm": 5.248038819366711, + "learning_rate": 1.368389199756075e-06, + "loss": 0.4179, + "step": 1981 + }, + { + "epoch": 0.77, + "grad_norm": 3.706165976752562, + "learning_rate": 1.3640870507313859e-06, + "loss": 0.4898, + "step": 1982 + }, + { + "epoch": 0.77, + "grad_norm": 3.6784957994753045, + "learning_rate": 1.3597906067389116e-06, + "loss": 0.2817, + "step": 1983 + }, + { + "epoch": 0.77, + "grad_norm": 4.108801191012866, + "learning_rate": 1.3554998745201114e-06, + "loss": 0.189, + "step": 1984 + }, + { + "epoch": 0.77, + "grad_norm": 4.2833266169990605, + "learning_rate": 1.351214860807476e-06, + "loss": 0.4302, + "step": 1985 + }, + { + "epoch": 0.77, + "grad_norm": 4.046966896037209, + "learning_rate": 1.3469355723245303e-06, + "loss": 0.2545, + "step": 1986 + }, + { + "epoch": 0.77, + "grad_norm": 4.681410091351572, + "learning_rate": 1.3426620157858122e-06, + "loss": 0.3114, + "step": 1987 + }, + { + "epoch": 0.77, + "grad_norm": 4.520905347693073, + "learning_rate": 1.3383941978968673e-06, + "loss": 0.3039, + "step": 1988 + }, + { + "epoch": 0.77, + "grad_norm": 3.2925634956430687, + "learning_rate": 1.334132125354236e-06, + "loss": 0.0927, + "step": 1989 + }, + { + "epoch": 0.77, + "grad_norm": 5.6608381582152525, + "learning_rate": 1.3298758048454436e-06, + "loss": 0.2809, + "step": 1990 + }, + { + "epoch": 0.77, + "grad_norm": 4.598559928535005, + "learning_rate": 1.3256252430489907e-06, + "loss": 0.3549, + "step": 1991 + }, + { + "epoch": 0.77, + "grad_norm": 3.750943366975744, + "learning_rate": 1.321380446634342e-06, + "loss": 0.4004, + "step": 1992 + }, + { + "epoch": 0.77, + "grad_norm": 4.582402456253899, + "learning_rate": 1.317141422261915e-06, + "loss": 0.1714, + "step": 1993 + }, + { + "epoch": 0.77, + "grad_norm": 4.9781433109450575, + "learning_rate": 1.3129081765830725e-06, + "loss": 0.2884, + "step": 1994 + }, + { + "epoch": 0.77, + "grad_norm": 3.983333650655471, + "learning_rate": 1.308680716240106e-06, + "loss": 0.3647, + "step": 1995 + }, + { + "epoch": 0.77, + "grad_norm": 4.037252926870133, + "learning_rate": 1.3044590478662328e-06, + "loss": 0.2001, + "step": 1996 + }, + { + "epoch": 0.77, + "grad_norm": 4.7007787473476625, + "learning_rate": 1.3002431780855817e-06, + "loss": 0.4664, + "step": 1997 + }, + { + "epoch": 0.77, + "grad_norm": 4.259652229021513, + "learning_rate": 1.2960331135131826e-06, + "loss": 0.5181, + "step": 1998 + }, + { + "epoch": 0.77, + "grad_norm": 3.8749742006193864, + "learning_rate": 1.2918288607549568e-06, + "loss": 0.2406, + "step": 1999 + }, + { + "epoch": 0.77, + "grad_norm": 4.79999946453577, + "learning_rate": 1.2876304264077056e-06, + "loss": 0.23, + "step": 2000 + }, + { + "epoch": 0.77, + "grad_norm": 4.928603656545691, + "learning_rate": 1.2834378170591017e-06, + "loss": 0.4401, + "step": 2001 + }, + { + "epoch": 0.77, + "grad_norm": 4.265612866438301, + "learning_rate": 1.2792510392876777e-06, + "loss": 0.397, + "step": 2002 + }, + { + "epoch": 0.77, + "grad_norm": 4.888399383415932, + "learning_rate": 1.275070099662815e-06, + "loss": 0.4213, + "step": 2003 + }, + { + "epoch": 0.77, + "grad_norm": 5.944330017371113, + "learning_rate": 1.270895004744737e-06, + "loss": 0.4076, + "step": 2004 + }, + { + "epoch": 0.78, + "grad_norm": 3.800461573265687, + "learning_rate": 1.266725761084493e-06, + "loss": 0.2683, + "step": 2005 + }, + { + "epoch": 0.78, + "grad_norm": 3.6299485297277014, + "learning_rate": 1.262562375223954e-06, + "loss": 0.1161, + "step": 2006 + }, + { + "epoch": 0.78, + "grad_norm": 5.558917516160953, + "learning_rate": 1.2584048536957982e-06, + "loss": 0.3336, + "step": 2007 + }, + { + "epoch": 0.78, + "grad_norm": 4.156952009933042, + "learning_rate": 1.2542532030235022e-06, + "loss": 0.1929, + "step": 2008 + }, + { + "epoch": 0.78, + "grad_norm": 4.194404639752258, + "learning_rate": 1.2501074297213312e-06, + "loss": 0.1215, + "step": 2009 + }, + { + "epoch": 0.78, + "grad_norm": 4.46505852031427, + "learning_rate": 1.245967540294329e-06, + "loss": 0.3978, + "step": 2010 + }, + { + "epoch": 0.78, + "grad_norm": 4.307157765575073, + "learning_rate": 1.2418335412383048e-06, + "loss": 0.2229, + "step": 2011 + }, + { + "epoch": 0.78, + "grad_norm": 3.316879722224256, + "learning_rate": 1.2377054390398285e-06, + "loss": 0.0755, + "step": 2012 + }, + { + "epoch": 0.78, + "grad_norm": 2.837458208310748, + "learning_rate": 1.233583240176217e-06, + "loss": 0.0884, + "step": 2013 + }, + { + "epoch": 0.78, + "grad_norm": 5.127736170224491, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.4923, + "step": 2014 + }, + { + "epoch": 0.78, + "grad_norm": 4.762526526414984, + "learning_rate": 1.2253565783165178e-06, + "loss": 0.1324, + "step": 2015 + }, + { + "epoch": 0.78, + "grad_norm": 5.119209105814661, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.3451, + "step": 2016 + }, + { + "epoch": 0.78, + "grad_norm": 5.572573493797165, + "learning_rate": 1.2171536072922968e-06, + "loss": 0.4531, + "step": 2017 + }, + { + "epoch": 0.78, + "grad_norm": 3.087635891798809, + "learning_rate": 1.2130610219381811e-06, + "loss": 0.3802, + "step": 2018 + }, + { + "epoch": 0.78, + "grad_norm": 5.675084661173655, + "learning_rate": 1.2089743785879493e-06, + "loss": 0.2072, + "step": 2019 + }, + { + "epoch": 0.78, + "grad_norm": 4.110603688136279, + "learning_rate": 1.204893683653865e-06, + "loss": 0.1613, + "step": 2020 + }, + { + "epoch": 0.78, + "grad_norm": 3.7380721050272148, + "learning_rate": 1.2008189435388578e-06, + "loss": 0.3018, + "step": 2021 + }, + { + "epoch": 0.78, + "grad_norm": 2.6167838386004427, + "learning_rate": 1.1967501646365147e-06, + "loss": 0.0758, + "step": 2022 + }, + { + "epoch": 0.78, + "grad_norm": 5.207302312264849, + "learning_rate": 1.1926873533310696e-06, + "loss": 0.1603, + "step": 2023 + }, + { + "epoch": 0.78, + "grad_norm": 3.399434592957552, + "learning_rate": 1.1886305159973883e-06, + "loss": 0.1417, + "step": 2024 + }, + { + "epoch": 0.78, + "grad_norm": 3.914128828875052, + "learning_rate": 1.1845796590009684e-06, + "loss": 0.1855, + "step": 2025 + }, + { + "epoch": 0.78, + "grad_norm": 5.8691992231252135, + "learning_rate": 1.1805347886979219e-06, + "loss": 0.4489, + "step": 2026 + }, + { + "epoch": 0.78, + "grad_norm": 4.262352832761651, + "learning_rate": 1.1764959114349666e-06, + "loss": 0.1457, + "step": 2027 + }, + { + "epoch": 0.78, + "grad_norm": 2.624405744820009, + "learning_rate": 1.172463033549418e-06, + "loss": 0.1777, + "step": 2028 + }, + { + "epoch": 0.78, + "grad_norm": 6.025271662536933, + "learning_rate": 1.1684361613691769e-06, + "loss": 0.2803, + "step": 2029 + }, + { + "epoch": 0.78, + "grad_norm": 3.9066979854150494, + "learning_rate": 1.1644153012127208e-06, + "loss": 0.2028, + "step": 2030 + }, + { + "epoch": 0.79, + "grad_norm": 4.9564926271561855, + "learning_rate": 1.160400459389095e-06, + "loss": 0.372, + "step": 2031 + }, + { + "epoch": 0.79, + "grad_norm": 4.479560878347843, + "learning_rate": 1.1563916421979004e-06, + "loss": 0.257, + "step": 2032 + }, + { + "epoch": 0.79, + "grad_norm": 4.758125967710335, + "learning_rate": 1.1523888559292857e-06, + "loss": 0.1786, + "step": 2033 + }, + { + "epoch": 0.79, + "grad_norm": 3.1132504235175125, + "learning_rate": 1.1483921068639353e-06, + "loss": 0.2248, + "step": 2034 + }, + { + "epoch": 0.79, + "grad_norm": 4.532879144747303, + "learning_rate": 1.144401401273062e-06, + "loss": 0.3273, + "step": 2035 + }, + { + "epoch": 0.79, + "grad_norm": 4.182532557670968, + "learning_rate": 1.1404167454183957e-06, + "loss": 0.2285, + "step": 2036 + }, + { + "epoch": 0.79, + "grad_norm": 6.350719304663705, + "learning_rate": 1.1364381455521728e-06, + "loss": 0.2246, + "step": 2037 + }, + { + "epoch": 0.79, + "grad_norm": 3.9694165033268005, + "learning_rate": 1.1324656079171288e-06, + "loss": 0.1475, + "step": 2038 + }, + { + "epoch": 0.79, + "grad_norm": 5.2326102236425305, + "learning_rate": 1.128499138746486e-06, + "loss": 0.4002, + "step": 2039 + }, + { + "epoch": 0.79, + "grad_norm": 5.30206636616595, + "learning_rate": 1.1245387442639456e-06, + "loss": 0.2632, + "step": 2040 + }, + { + "epoch": 0.79, + "grad_norm": 5.138805068569877, + "learning_rate": 1.120584430683676e-06, + "loss": 0.5556, + "step": 2041 + }, + { + "epoch": 0.79, + "grad_norm": 7.664457648996525, + "learning_rate": 1.1166362042103056e-06, + "loss": 0.2151, + "step": 2042 + }, + { + "epoch": 0.79, + "grad_norm": 6.481110043185694, + "learning_rate": 1.1126940710389128e-06, + "loss": 0.4308, + "step": 2043 + }, + { + "epoch": 0.79, + "grad_norm": 4.916600677596624, + "learning_rate": 1.10875803735501e-06, + "loss": 0.3834, + "step": 2044 + }, + { + "epoch": 0.79, + "grad_norm": 4.043080724308969, + "learning_rate": 1.1048281093345437e-06, + "loss": 0.1925, + "step": 2045 + }, + { + "epoch": 0.79, + "grad_norm": 4.558594051183082, + "learning_rate": 1.1009042931438784e-06, + "loss": 0.3452, + "step": 2046 + }, + { + "epoch": 0.79, + "grad_norm": 3.413240832614784, + "learning_rate": 1.0969865949397902e-06, + "loss": 0.2696, + "step": 2047 + }, + { + "epoch": 0.79, + "grad_norm": 5.254044799732455, + "learning_rate": 1.0930750208694535e-06, + "loss": 0.2523, + "step": 2048 + }, + { + "epoch": 0.79, + "grad_norm": 4.81778679253413, + "learning_rate": 1.0891695770704341e-06, + "loss": 0.1174, + "step": 2049 + }, + { + "epoch": 0.79, + "grad_norm": 4.677530913602758, + "learning_rate": 1.0852702696706807e-06, + "loss": 0.5205, + "step": 2050 + }, + { + "epoch": 0.79, + "grad_norm": 5.609905228090157, + "learning_rate": 1.0813771047885107e-06, + "loss": 0.3916, + "step": 2051 + }, + { + "epoch": 0.79, + "grad_norm": 3.7234916972882646, + "learning_rate": 1.077490088532605e-06, + "loss": 0.2206, + "step": 2052 + }, + { + "epoch": 0.79, + "grad_norm": 3.398685893251061, + "learning_rate": 1.0736092270019987e-06, + "loss": 0.3175, + "step": 2053 + }, + { + "epoch": 0.79, + "grad_norm": 3.5592097414637895, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.1735, + "step": 2054 + }, + { + "epoch": 0.79, + "grad_norm": 3.9940841160069587, + "learning_rate": 1.0658659924645109e-06, + "loss": 0.3928, + "step": 2055 + }, + { + "epoch": 0.79, + "grad_norm": 4.511446606912383, + "learning_rate": 1.062003631607373e-06, + "loss": 0.2891, + "step": 2056 + }, + { + "epoch": 0.8, + "grad_norm": 4.2704572363188085, + "learning_rate": 1.0581474497749965e-06, + "loss": 0.2182, + "step": 2057 + }, + { + "epoch": 0.8, + "grad_norm": 6.939171355285332, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.5411, + "step": 2058 + }, + { + "epoch": 0.8, + "grad_norm": 3.91519392759276, + "learning_rate": 1.050453647377428e-06, + "loss": 0.2355, + "step": 2059 + }, + { + "epoch": 0.8, + "grad_norm": 3.2279221254710553, + "learning_rate": 1.0466160388844149e-06, + "loss": 0.2909, + "step": 2060 + }, + { + "epoch": 0.8, + "grad_norm": 4.413789562276128, + "learning_rate": 1.0427846335605012e-06, + "loss": 0.381, + "step": 2061 + }, + { + "epoch": 0.8, + "grad_norm": 4.092313236446898, + "learning_rate": 1.0389594374174628e-06, + "loss": 0.3005, + "step": 2062 + }, + { + "epoch": 0.8, + "grad_norm": 5.197869963235752, + "learning_rate": 1.0351404564573314e-06, + "loss": 0.3983, + "step": 2063 + }, + { + "epoch": 0.8, + "grad_norm": 3.8493024986407303, + "learning_rate": 1.0313276966723867e-06, + "loss": 0.4979, + "step": 2064 + }, + { + "epoch": 0.8, + "grad_norm": 4.846811971577239, + "learning_rate": 1.0275211640451488e-06, + "loss": 0.2574, + "step": 2065 + }, + { + "epoch": 0.8, + "grad_norm": 5.555778848514527, + "learning_rate": 1.0237208645483648e-06, + "loss": 0.4752, + "step": 2066 + }, + { + "epoch": 0.8, + "grad_norm": 3.53702990137847, + "learning_rate": 1.0199268041450023e-06, + "loss": 0.4165, + "step": 2067 + }, + { + "epoch": 0.8, + "grad_norm": 5.716237784894001, + "learning_rate": 1.0161389887882394e-06, + "loss": 0.2189, + "step": 2068 + }, + { + "epoch": 0.8, + "grad_norm": 3.493212914863018, + "learning_rate": 1.0123574244214552e-06, + "loss": 0.38, + "step": 2069 + }, + { + "epoch": 0.8, + "grad_norm": 3.627727168634854, + "learning_rate": 1.00858211697822e-06, + "loss": 0.4674, + "step": 2070 + }, + { + "epoch": 0.8, + "grad_norm": 5.008450903032346, + "learning_rate": 1.0048130723822874e-06, + "loss": 0.2886, + "step": 2071 + }, + { + "epoch": 0.8, + "grad_norm": 3.2159768889456473, + "learning_rate": 1.0010502965475843e-06, + "loss": 0.1351, + "step": 2072 + }, + { + "epoch": 0.8, + "grad_norm": 4.164295800423085, + "learning_rate": 9.972937953781985e-07, + "loss": 0.3149, + "step": 2073 + }, + { + "epoch": 0.8, + "grad_norm": 3.5698815129316523, + "learning_rate": 9.935435747683758e-07, + "loss": 0.3081, + "step": 2074 + }, + { + "epoch": 0.8, + "grad_norm": 2.9989737239095793, + "learning_rate": 9.897996406025062e-07, + "loss": 0.0709, + "step": 2075 + }, + { + "epoch": 0.8, + "grad_norm": 4.27243695334749, + "learning_rate": 9.860619987551157e-07, + "loss": 0.1984, + "step": 2076 + }, + { + "epoch": 0.8, + "grad_norm": 4.137489306606834, + "learning_rate": 9.823306550908568e-07, + "loss": 0.4305, + "step": 2077 + }, + { + "epoch": 0.8, + "grad_norm": 3.6600039547541012, + "learning_rate": 9.786056154645001e-07, + "loss": 0.1008, + "step": 2078 + }, + { + "epoch": 0.8, + "grad_norm": 2.5358824541027305, + "learning_rate": 9.74886885720925e-07, + "loss": 0.4141, + "step": 2079 + }, + { + "epoch": 0.8, + "grad_norm": 4.37324014539792, + "learning_rate": 9.711744716951093e-07, + "loss": 0.2516, + "step": 2080 + }, + { + "epoch": 0.8, + "grad_norm": 4.461854105449941, + "learning_rate": 9.674683792121214e-07, + "loss": 0.2222, + "step": 2081 + }, + { + "epoch": 0.8, + "grad_norm": 3.9228980377943365, + "learning_rate": 9.637686140871121e-07, + "loss": 0.417, + "step": 2082 + }, + { + "epoch": 0.81, + "grad_norm": 3.0523698451974934, + "learning_rate": 9.600751821252996e-07, + "loss": 0.0772, + "step": 2083 + }, + { + "epoch": 0.81, + "grad_norm": 4.519691539735653, + "learning_rate": 9.563880891219702e-07, + "loss": 0.2497, + "step": 2084 + }, + { + "epoch": 0.81, + "grad_norm": 3.1123399819194137, + "learning_rate": 9.52707340862461e-07, + "loss": 0.247, + "step": 2085 + }, + { + "epoch": 0.81, + "grad_norm": 3.0621168607130396, + "learning_rate": 9.490329431221545e-07, + "loss": 0.342, + "step": 2086 + }, + { + "epoch": 0.81, + "grad_norm": 3.65955452278162, + "learning_rate": 9.45364901666469e-07, + "loss": 0.4608, + "step": 2087 + }, + { + "epoch": 0.81, + "grad_norm": 6.044479388675468, + "learning_rate": 9.417032222508476e-07, + "loss": 0.3426, + "step": 2088 + }, + { + "epoch": 0.81, + "grad_norm": 5.106807424334697, + "learning_rate": 9.380479106207535e-07, + "loss": 0.6374, + "step": 2089 + }, + { + "epoch": 0.81, + "grad_norm": 4.095472272297561, + "learning_rate": 9.34398972511656e-07, + "loss": 0.4042, + "step": 2090 + }, + { + "epoch": 0.81, + "grad_norm": 3.6407607405805673, + "learning_rate": 9.307564136490255e-07, + "loss": 0.4477, + "step": 2091 + }, + { + "epoch": 0.81, + "grad_norm": 5.029536211055726, + "learning_rate": 9.271202397483214e-07, + "loss": 0.168, + "step": 2092 + }, + { + "epoch": 0.81, + "grad_norm": 3.1264121192362637, + "learning_rate": 9.234904565149866e-07, + "loss": 0.1865, + "step": 2093 + }, + { + "epoch": 0.81, + "grad_norm": 4.600785134736331, + "learning_rate": 9.198670696444339e-07, + "loss": 0.569, + "step": 2094 + }, + { + "epoch": 0.81, + "grad_norm": 6.354844649574498, + "learning_rate": 9.162500848220418e-07, + "loss": 0.3773, + "step": 2095 + }, + { + "epoch": 0.81, + "grad_norm": 3.9763569877557283, + "learning_rate": 9.126395077231432e-07, + "loss": 0.5987, + "step": 2096 + }, + { + "epoch": 0.81, + "grad_norm": 4.670891860269174, + "learning_rate": 9.090353440130156e-07, + "loss": 0.3412, + "step": 2097 + }, + { + "epoch": 0.81, + "grad_norm": 3.9757048622207876, + "learning_rate": 9.054375993468745e-07, + "loss": 0.1836, + "step": 2098 + }, + { + "epoch": 0.81, + "grad_norm": 3.8169569598598896, + "learning_rate": 9.018462793698629e-07, + "loss": 0.2635, + "step": 2099 + }, + { + "epoch": 0.81, + "grad_norm": 2.5066396076847877, + "learning_rate": 8.982613897170439e-07, + "loss": 0.2857, + "step": 2100 + }, + { + "epoch": 0.81, + "grad_norm": 4.574040814370678, + "learning_rate": 8.946829360133891e-07, + "loss": 0.1768, + "step": 2101 + }, + { + "epoch": 0.81, + "grad_norm": 3.06970750749417, + "learning_rate": 8.911109238737748e-07, + "loss": 0.2801, + "step": 2102 + }, + { + "epoch": 0.81, + "grad_norm": 4.478024686652142, + "learning_rate": 8.875453589029648e-07, + "loss": 0.1764, + "step": 2103 + }, + { + "epoch": 0.81, + "grad_norm": 3.507724182930931, + "learning_rate": 8.839862466956112e-07, + "loss": 0.3428, + "step": 2104 + }, + { + "epoch": 0.81, + "grad_norm": 3.46631075228589, + "learning_rate": 8.804335928362401e-07, + "loss": 0.3971, + "step": 2105 + }, + { + "epoch": 0.81, + "grad_norm": 3.728797916190673, + "learning_rate": 8.768874028992431e-07, + "loss": 0.2106, + "step": 2106 + }, + { + "epoch": 0.81, + "grad_norm": 4.01213483599429, + "learning_rate": 8.733476824488707e-07, + "loss": 0.3977, + "step": 2107 + }, + { + "epoch": 0.81, + "grad_norm": 3.8858479135518835, + "learning_rate": 8.698144370392209e-07, + "loss": 0.3053, + "step": 2108 + }, + { + "epoch": 0.82, + "grad_norm": 4.721128699419567, + "learning_rate": 8.662876722142327e-07, + "loss": 0.3185, + "step": 2109 + }, + { + "epoch": 0.82, + "grad_norm": 4.466868520310928, + "learning_rate": 8.627673935076769e-07, + "loss": 0.1758, + "step": 2110 + }, + { + "epoch": 0.82, + "grad_norm": 5.433307332884818, + "learning_rate": 8.592536064431467e-07, + "loss": 0.4071, + "step": 2111 + }, + { + "epoch": 0.82, + "grad_norm": 5.134998907112914, + "learning_rate": 8.557463165340479e-07, + "loss": 0.2446, + "step": 2112 + }, + { + "epoch": 0.82, + "grad_norm": 6.00151766537157, + "learning_rate": 8.522455292835935e-07, + "loss": 0.1392, + "step": 2113 + }, + { + "epoch": 0.82, + "grad_norm": 4.4654451968116975, + "learning_rate": 8.487512501847933e-07, + "loss": 0.4149, + "step": 2114 + }, + { + "epoch": 0.82, + "grad_norm": 5.561830986099788, + "learning_rate": 8.452634847204449e-07, + "loss": 0.376, + "step": 2115 + }, + { + "epoch": 0.82, + "grad_norm": 4.605604282870342, + "learning_rate": 8.417822383631258e-07, + "loss": 0.4221, + "step": 2116 + }, + { + "epoch": 0.82, + "grad_norm": 6.305985198147468, + "learning_rate": 8.38307516575183e-07, + "loss": 0.4552, + "step": 2117 + }, + { + "epoch": 0.82, + "grad_norm": 4.308173256421205, + "learning_rate": 8.348393248087289e-07, + "loss": 0.2849, + "step": 2118 + }, + { + "epoch": 0.82, + "grad_norm": 6.649060332381415, + "learning_rate": 8.313776685056262e-07, + "loss": 0.3408, + "step": 2119 + }, + { + "epoch": 0.82, + "grad_norm": 4.388896373470471, + "learning_rate": 8.279225530974866e-07, + "loss": 0.193, + "step": 2120 + }, + { + "epoch": 0.82, + "grad_norm": 1.7355215497481191, + "learning_rate": 8.244739840056554e-07, + "loss": 0.1914, + "step": 2121 + }, + { + "epoch": 0.82, + "grad_norm": 5.410760774134362, + "learning_rate": 8.210319666412087e-07, + "loss": 0.3838, + "step": 2122 + }, + { + "epoch": 0.82, + "grad_norm": 5.937757317673456, + "learning_rate": 8.175965064049401e-07, + "loss": 0.3069, + "step": 2123 + }, + { + "epoch": 0.82, + "grad_norm": 5.188973863798863, + "learning_rate": 8.141676086873574e-07, + "loss": 0.4484, + "step": 2124 + }, + { + "epoch": 0.82, + "grad_norm": 3.1448707670417018, + "learning_rate": 8.107452788686681e-07, + "loss": 0.3897, + "step": 2125 + }, + { + "epoch": 0.82, + "grad_norm": 2.79337077314214, + "learning_rate": 8.073295223187766e-07, + "loss": 0.1276, + "step": 2126 + }, + { + "epoch": 0.82, + "grad_norm": 3.114734385974199, + "learning_rate": 8.039203443972726e-07, + "loss": 0.1251, + "step": 2127 + }, + { + "epoch": 0.82, + "grad_norm": 3.8419358936248424, + "learning_rate": 8.005177504534228e-07, + "loss": 0.0652, + "step": 2128 + }, + { + "epoch": 0.82, + "grad_norm": 5.163174007698943, + "learning_rate": 7.971217458261643e-07, + "loss": 0.3861, + "step": 2129 + }, + { + "epoch": 0.82, + "grad_norm": 3.644778096748403, + "learning_rate": 7.937323358440935e-07, + "loss": 0.0956, + "step": 2130 + }, + { + "epoch": 0.82, + "grad_norm": 3.9069050437062187, + "learning_rate": 7.903495258254624e-07, + "loss": 0.3901, + "step": 2131 + }, + { + "epoch": 0.82, + "grad_norm": 6.17563498329954, + "learning_rate": 7.869733210781611e-07, + "loss": 0.396, + "step": 2132 + }, + { + "epoch": 0.82, + "grad_norm": 3.3817413649102677, + "learning_rate": 7.836037268997221e-07, + "loss": 0.1037, + "step": 2133 + }, + { + "epoch": 0.83, + "grad_norm": 3.6837222956817044, + "learning_rate": 7.802407485773011e-07, + "loss": 0.2575, + "step": 2134 + }, + { + "epoch": 0.83, + "grad_norm": 3.2467913594081104, + "learning_rate": 7.768843913876756e-07, + "loss": 0.1036, + "step": 2135 + }, + { + "epoch": 0.83, + "grad_norm": 3.164768780656091, + "learning_rate": 7.735346605972322e-07, + "loss": 0.3418, + "step": 2136 + }, + { + "epoch": 0.83, + "grad_norm": 3.20933849417798, + "learning_rate": 7.701915614619615e-07, + "loss": 0.1194, + "step": 2137 + }, + { + "epoch": 0.83, + "grad_norm": 4.441401383440821, + "learning_rate": 7.668550992274476e-07, + "loss": 0.3822, + "step": 2138 + }, + { + "epoch": 0.83, + "grad_norm": 5.435654312501965, + "learning_rate": 7.635252791288611e-07, + "loss": 0.309, + "step": 2139 + }, + { + "epoch": 0.83, + "grad_norm": 4.513225623672635, + "learning_rate": 7.60202106390951e-07, + "loss": 0.181, + "step": 2140 + }, + { + "epoch": 0.83, + "grad_norm": 7.121618568580262, + "learning_rate": 7.568855862280356e-07, + "loss": 0.4322, + "step": 2141 + }, + { + "epoch": 0.83, + "grad_norm": 4.069097365037233, + "learning_rate": 7.535757238439939e-07, + "loss": 0.2743, + "step": 2142 + }, + { + "epoch": 0.83, + "grad_norm": 5.037148328924976, + "learning_rate": 7.502725244322596e-07, + "loss": 0.3722, + "step": 2143 + }, + { + "epoch": 0.83, + "grad_norm": 4.769759623713564, + "learning_rate": 7.469759931758109e-07, + "loss": 0.1261, + "step": 2144 + }, + { + "epoch": 0.83, + "grad_norm": 4.6513560243308065, + "learning_rate": 7.436861352471647e-07, + "loss": 0.3153, + "step": 2145 + }, + { + "epoch": 0.83, + "grad_norm": 3.622277249400289, + "learning_rate": 7.404029558083653e-07, + "loss": 0.1517, + "step": 2146 + }, + { + "epoch": 0.83, + "grad_norm": 6.282837150432579, + "learning_rate": 7.371264600109779e-07, + "loss": 0.4481, + "step": 2147 + }, + { + "epoch": 0.83, + "grad_norm": 4.6116691472748395, + "learning_rate": 7.338566529960817e-07, + "loss": 0.1992, + "step": 2148 + }, + { + "epoch": 0.83, + "grad_norm": 3.881261732047941, + "learning_rate": 7.305935398942598e-07, + "loss": 0.1339, + "step": 2149 + }, + { + "epoch": 0.83, + "grad_norm": 2.5362180276910373, + "learning_rate": 7.273371258255923e-07, + "loss": 0.281, + "step": 2150 + }, + { + "epoch": 0.83, + "grad_norm": 5.231597373086418, + "learning_rate": 7.240874158996475e-07, + "loss": 0.5531, + "step": 2151 + }, + { + "epoch": 0.83, + "grad_norm": 4.371128302847014, + "learning_rate": 7.208444152154759e-07, + "loss": 0.1399, + "step": 2152 + }, + { + "epoch": 0.83, + "grad_norm": 4.358173010504196, + "learning_rate": 7.176081288615983e-07, + "loss": 0.135, + "step": 2153 + }, + { + "epoch": 0.83, + "grad_norm": 4.562187126615553, + "learning_rate": 7.143785619160026e-07, + "loss": 0.2225, + "step": 2154 + }, + { + "epoch": 0.83, + "grad_norm": 4.248814129365013, + "learning_rate": 7.11155719446131e-07, + "loss": 0.5118, + "step": 2155 + }, + { + "epoch": 0.83, + "grad_norm": 3.7086853661453736, + "learning_rate": 7.079396065088773e-07, + "loss": 0.4558, + "step": 2156 + }, + { + "epoch": 0.83, + "grad_norm": 5.570030705304649, + "learning_rate": 7.047302281505735e-07, + "loss": 0.426, + "step": 2157 + }, + { + "epoch": 0.83, + "grad_norm": 5.286519515007753, + "learning_rate": 7.015275894069862e-07, + "loss": 0.1149, + "step": 2158 + }, + { + "epoch": 0.83, + "grad_norm": 3.0759124027897693, + "learning_rate": 6.983316953033064e-07, + "loss": 0.089, + "step": 2159 + }, + { + "epoch": 0.84, + "grad_norm": 4.533310420852684, + "learning_rate": 6.951425508541432e-07, + "loss": 0.3715, + "step": 2160 + }, + { + "epoch": 0.84, + "grad_norm": 4.966910003663425, + "learning_rate": 6.919601610635118e-07, + "loss": 0.2148, + "step": 2161 + }, + { + "epoch": 0.84, + "grad_norm": 4.265341662817313, + "learning_rate": 6.887845309248326e-07, + "loss": 0.1649, + "step": 2162 + }, + { + "epoch": 0.84, + "grad_norm": 4.2034762310858405, + "learning_rate": 6.856156654209173e-07, + "loss": 0.1545, + "step": 2163 + }, + { + "epoch": 0.84, + "grad_norm": 5.164156647443338, + "learning_rate": 6.824535695239643e-07, + "loss": 0.2189, + "step": 2164 + }, + { + "epoch": 0.84, + "grad_norm": 4.23657078796661, + "learning_rate": 6.792982481955502e-07, + "loss": 0.2903, + "step": 2165 + }, + { + "epoch": 0.84, + "grad_norm": 3.343945320000238, + "learning_rate": 6.761497063866207e-07, + "loss": 0.3613, + "step": 2166 + }, + { + "epoch": 0.84, + "grad_norm": 4.113784060762545, + "learning_rate": 6.730079490374852e-07, + "loss": 0.2078, + "step": 2167 + }, + { + "epoch": 0.84, + "grad_norm": 5.238462587662076, + "learning_rate": 6.698729810778065e-07, + "loss": 0.4055, + "step": 2168 + }, + { + "epoch": 0.84, + "grad_norm": 4.73984052317012, + "learning_rate": 6.667448074265954e-07, + "loss": 0.496, + "step": 2169 + }, + { + "epoch": 0.84, + "grad_norm": 3.7840827126001346, + "learning_rate": 6.63623432992202e-07, + "loss": 0.4575, + "step": 2170 + }, + { + "epoch": 0.84, + "grad_norm": 3.6539429361993143, + "learning_rate": 6.605088626723055e-07, + "loss": 0.1113, + "step": 2171 + }, + { + "epoch": 0.84, + "grad_norm": 4.061510014960234, + "learning_rate": 6.574011013539111e-07, + "loss": 0.2968, + "step": 2172 + }, + { + "epoch": 0.84, + "grad_norm": 6.691724520906602, + "learning_rate": 6.543001539133409e-07, + "loss": 0.4197, + "step": 2173 + }, + { + "epoch": 0.84, + "grad_norm": 3.0898603323675964, + "learning_rate": 6.512060252162228e-07, + "loss": 0.2099, + "step": 2174 + }, + { + "epoch": 0.84, + "grad_norm": 3.893812609554106, + "learning_rate": 6.481187201174888e-07, + "loss": 0.3421, + "step": 2175 + }, + { + "epoch": 0.84, + "grad_norm": 3.454004315665616, + "learning_rate": 6.450382434613612e-07, + "loss": 0.1934, + "step": 2176 + }, + { + "epoch": 0.84, + "grad_norm": 5.0834905530575485, + "learning_rate": 6.41964600081349e-07, + "loss": 0.2604, + "step": 2177 + }, + { + "epoch": 0.84, + "grad_norm": 3.631006931804685, + "learning_rate": 6.388977948002406e-07, + "loss": 0.2214, + "step": 2178 + }, + { + "epoch": 0.84, + "grad_norm": 3.9515344707274664, + "learning_rate": 6.358378324300929e-07, + "loss": 0.0976, + "step": 2179 + }, + { + "epoch": 0.84, + "grad_norm": 2.8578028818205463, + "learning_rate": 6.327847177722274e-07, + "loss": 0.1415, + "step": 2180 + }, + { + "epoch": 0.84, + "grad_norm": 4.250405384395825, + "learning_rate": 6.297384556172176e-07, + "loss": 0.2038, + "step": 2181 + }, + { + "epoch": 0.84, + "grad_norm": 5.912978693166093, + "learning_rate": 6.2669905074489e-07, + "loss": 0.3054, + "step": 2182 + }, + { + "epoch": 0.84, + "grad_norm": 3.2949957962456864, + "learning_rate": 6.236665079243087e-07, + "loss": 0.2979, + "step": 2183 + }, + { + "epoch": 0.84, + "grad_norm": 4.171347472760072, + "learning_rate": 6.206408319137703e-07, + "loss": 0.4694, + "step": 2184 + }, + { + "epoch": 0.84, + "grad_norm": 4.075732236688332, + "learning_rate": 6.176220274607975e-07, + "loss": 0.4996, + "step": 2185 + }, + { + "epoch": 0.85, + "grad_norm": 4.240542025206586, + "learning_rate": 6.146100993021308e-07, + "loss": 0.1871, + "step": 2186 + }, + { + "epoch": 0.85, + "grad_norm": 3.0951669262417156, + "learning_rate": 6.116050521637218e-07, + "loss": 0.3241, + "step": 2187 + }, + { + "epoch": 0.85, + "grad_norm": 3.55318070900083, + "learning_rate": 6.086068907607245e-07, + "loss": 0.1071, + "step": 2188 + }, + { + "epoch": 0.85, + "grad_norm": 3.6878309228777213, + "learning_rate": 6.056156197974888e-07, + "loss": 0.2196, + "step": 2189 + }, + { + "epoch": 0.85, + "grad_norm": 4.165476001138745, + "learning_rate": 6.026312439675553e-07, + "loss": 0.2738, + "step": 2190 + }, + { + "epoch": 0.85, + "grad_norm": 3.904441308239979, + "learning_rate": 5.996537679536401e-07, + "loss": 0.2987, + "step": 2191 + }, + { + "epoch": 0.85, + "grad_norm": 4.166171824537586, + "learning_rate": 5.966831964276376e-07, + "loss": 0.4376, + "step": 2192 + }, + { + "epoch": 0.85, + "grad_norm": 4.752733155157431, + "learning_rate": 5.93719534050608e-07, + "loss": 0.182, + "step": 2193 + }, + { + "epoch": 0.85, + "grad_norm": 5.1621407767764165, + "learning_rate": 5.907627854727688e-07, + "loss": 0.4705, + "step": 2194 + }, + { + "epoch": 0.85, + "grad_norm": 4.413922763072018, + "learning_rate": 5.878129553334905e-07, + "loss": 0.4403, + "step": 2195 + }, + { + "epoch": 0.85, + "grad_norm": 1.6748962070067934, + "learning_rate": 5.848700482612873e-07, + "loss": 0.1076, + "step": 2196 + }, + { + "epoch": 0.85, + "grad_norm": 3.658284669907072, + "learning_rate": 5.819340688738118e-07, + "loss": 0.3573, + "step": 2197 + }, + { + "epoch": 0.85, + "grad_norm": 4.244987066843069, + "learning_rate": 5.790050217778442e-07, + "loss": 0.292, + "step": 2198 + }, + { + "epoch": 0.85, + "grad_norm": 3.610341401184943, + "learning_rate": 5.760829115692907e-07, + "loss": 0.1896, + "step": 2199 + }, + { + "epoch": 0.85, + "grad_norm": 4.377775047650545, + "learning_rate": 5.7316774283317e-07, + "loss": 0.3236, + "step": 2200 + }, + { + "epoch": 0.85, + "grad_norm": 2.779888792861332, + "learning_rate": 5.702595201436101e-07, + "loss": 0.3476, + "step": 2201 + }, + { + "epoch": 0.85, + "grad_norm": 4.007938371187252, + "learning_rate": 5.673582480638395e-07, + "loss": 0.4435, + "step": 2202 + }, + { + "epoch": 0.85, + "grad_norm": 4.151679689529081, + "learning_rate": 5.64463931146183e-07, + "loss": 0.2445, + "step": 2203 + }, + { + "epoch": 0.85, + "grad_norm": 3.8556135935279046, + "learning_rate": 5.615765739320494e-07, + "loss": 0.1526, + "step": 2204 + }, + { + "epoch": 0.85, + "grad_norm": 5.048550000357814, + "learning_rate": 5.586961809519287e-07, + "loss": 0.319, + "step": 2205 + }, + { + "epoch": 0.85, + "grad_norm": 3.6271400582880586, + "learning_rate": 5.558227567253832e-07, + "loss": 0.0709, + "step": 2206 + }, + { + "epoch": 0.85, + "grad_norm": 4.081379093376282, + "learning_rate": 5.529563057610399e-07, + "loss": 0.1807, + "step": 2207 + }, + { + "epoch": 0.85, + "grad_norm": 3.6094950199211286, + "learning_rate": 5.500968325565859e-07, + "loss": 0.3581, + "step": 2208 + }, + { + "epoch": 0.85, + "grad_norm": 5.397011302400871, + "learning_rate": 5.472443415987594e-07, + "loss": 0.404, + "step": 2209 + }, + { + "epoch": 0.85, + "grad_norm": 4.449418872569559, + "learning_rate": 5.443988373633397e-07, + "loss": 0.3493, + "step": 2210 + }, + { + "epoch": 0.85, + "grad_norm": 3.7212599348359543, + "learning_rate": 5.415603243151469e-07, + "loss": 0.1638, + "step": 2211 + }, + { + "epoch": 0.86, + "grad_norm": 5.101584966107421, + "learning_rate": 5.387288069080298e-07, + "loss": 0.4121, + "step": 2212 + }, + { + "epoch": 0.86, + "grad_norm": 6.726380754203162, + "learning_rate": 5.359042895848626e-07, + "loss": 0.28, + "step": 2213 + }, + { + "epoch": 0.86, + "grad_norm": 3.52379869067051, + "learning_rate": 5.330867767775333e-07, + "loss": 0.1494, + "step": 2214 + }, + { + "epoch": 0.86, + "grad_norm": 3.6137579601650396, + "learning_rate": 5.302762729069399e-07, + "loss": 0.1219, + "step": 2215 + }, + { + "epoch": 0.86, + "grad_norm": 2.6887392408225614, + "learning_rate": 5.274727823829839e-07, + "loss": 0.0591, + "step": 2216 + }, + { + "epoch": 0.86, + "grad_norm": 7.291731173000995, + "learning_rate": 5.246763096045604e-07, + "loss": 0.3837, + "step": 2217 + }, + { + "epoch": 0.86, + "grad_norm": 6.494925188042238, + "learning_rate": 5.218868589595555e-07, + "loss": 0.3388, + "step": 2218 + }, + { + "epoch": 0.86, + "grad_norm": 5.045719425080615, + "learning_rate": 5.191044348248358e-07, + "loss": 0.5384, + "step": 2219 + }, + { + "epoch": 0.86, + "grad_norm": 4.583416393498012, + "learning_rate": 5.163290415662408e-07, + "loss": 0.1205, + "step": 2220 + }, + { + "epoch": 0.86, + "grad_norm": 5.1126018660664005, + "learning_rate": 5.13560683538582e-07, + "loss": 0.3394, + "step": 2221 + }, + { + "epoch": 0.86, + "grad_norm": 2.602219295464481, + "learning_rate": 5.107993650856285e-07, + "loss": 0.2336, + "step": 2222 + }, + { + "epoch": 0.86, + "grad_norm": 4.4779864062241925, + "learning_rate": 5.080450905401057e-07, + "loss": 0.3208, + "step": 2223 + }, + { + "epoch": 0.86, + "grad_norm": 5.221824269737294, + "learning_rate": 5.052978642236866e-07, + "loss": 0.1526, + "step": 2224 + }, + { + "epoch": 0.86, + "grad_norm": 3.976218441578814, + "learning_rate": 5.025576904469842e-07, + "loss": 0.4964, + "step": 2225 + }, + { + "epoch": 0.86, + "grad_norm": 3.0440285075835165, + "learning_rate": 4.998245735095459e-07, + "loss": 0.5404, + "step": 2226 + }, + { + "epoch": 0.86, + "grad_norm": 5.0172220978893245, + "learning_rate": 4.970985176998455e-07, + "loss": 0.2748, + "step": 2227 + }, + { + "epoch": 0.86, + "grad_norm": 3.76160128353007, + "learning_rate": 4.943795272952795e-07, + "loss": 0.2733, + "step": 2228 + }, + { + "epoch": 0.86, + "grad_norm": 3.1256717890706294, + "learning_rate": 4.916676065621562e-07, + "loss": 0.1491, + "step": 2229 + }, + { + "epoch": 0.86, + "grad_norm": 3.3672764462319758, + "learning_rate": 4.889627597556911e-07, + "loss": 0.1067, + "step": 2230 + }, + { + "epoch": 0.86, + "grad_norm": 3.6672614940569095, + "learning_rate": 4.862649911200007e-07, + "loss": 0.0883, + "step": 2231 + }, + { + "epoch": 0.86, + "grad_norm": 3.8905968351187106, + "learning_rate": 4.835743048880959e-07, + "loss": 0.3253, + "step": 2232 + }, + { + "epoch": 0.86, + "grad_norm": 4.977738056689484, + "learning_rate": 4.80890705281874e-07, + "loss": 0.3721, + "step": 2233 + }, + { + "epoch": 0.86, + "grad_norm": 4.935860819344814, + "learning_rate": 4.782141965121129e-07, + "loss": 0.1413, + "step": 2234 + }, + { + "epoch": 0.86, + "grad_norm": 4.77547836114163, + "learning_rate": 4.7554478277846427e-07, + "loss": 0.3436, + "step": 2235 + }, + { + "epoch": 0.86, + "grad_norm": 2.9662639339985204, + "learning_rate": 4.7288246826944826e-07, + "loss": 0.2761, + "step": 2236 + }, + { + "epoch": 0.86, + "grad_norm": 5.088850430302421, + "learning_rate": 4.7022725716244387e-07, + "loss": 0.268, + "step": 2237 + }, + { + "epoch": 0.87, + "grad_norm": 4.229226765216043, + "learning_rate": 4.6757915362368567e-07, + "loss": 0.1782, + "step": 2238 + }, + { + "epoch": 0.87, + "grad_norm": 4.349486192130572, + "learning_rate": 4.64938161808256e-07, + "loss": 0.2291, + "step": 2239 + }, + { + "epoch": 0.87, + "grad_norm": 2.7510258273627577, + "learning_rate": 4.6230428586007734e-07, + "loss": 0.1063, + "step": 2240 + }, + { + "epoch": 0.87, + "grad_norm": 4.317631021090621, + "learning_rate": 4.596775299119066e-07, + "loss": 0.2341, + "step": 2241 + }, + { + "epoch": 0.87, + "grad_norm": 3.8447583158308825, + "learning_rate": 4.570578980853302e-07, + "loss": 0.4979, + "step": 2242 + }, + { + "epoch": 0.87, + "grad_norm": 3.930941806279006, + "learning_rate": 4.5444539449075677e-07, + "loss": 0.2828, + "step": 2243 + }, + { + "epoch": 0.87, + "grad_norm": 3.5638729630919426, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.3164, + "step": 2244 + }, + { + "epoch": 0.87, + "grad_norm": 5.792267002535738, + "learning_rate": 4.4924178838331554e-07, + "loss": 0.2149, + "step": 2245 + }, + { + "epoch": 0.87, + "grad_norm": 2.9765455635327998, + "learning_rate": 4.466506940353138e-07, + "loss": 0.1257, + "step": 2246 + }, + { + "epoch": 0.87, + "grad_norm": 4.449574773706834, + "learning_rate": 4.4406674424903264e-07, + "loss": 0.4303, + "step": 2247 + }, + { + "epoch": 0.87, + "grad_norm": 3.072806953412824, + "learning_rate": 4.414899430788916e-07, + "loss": 0.1642, + "step": 2248 + }, + { + "epoch": 0.87, + "grad_norm": 4.209866210186225, + "learning_rate": 4.389202945680943e-07, + "loss": 0.3411, + "step": 2249 + }, + { + "epoch": 0.87, + "grad_norm": 3.5137050490375983, + "learning_rate": 4.363578027486187e-07, + "loss": 0.3249, + "step": 2250 + }, + { + "epoch": 0.87, + "grad_norm": 6.479277042576117, + "learning_rate": 4.338024716412165e-07, + "loss": 0.3783, + "step": 2251 + }, + { + "epoch": 0.87, + "grad_norm": 3.3536186914531054, + "learning_rate": 4.3125430525540193e-07, + "loss": 0.0922, + "step": 2252 + }, + { + "epoch": 0.87, + "grad_norm": 3.45840467049957, + "learning_rate": 4.2871330758944786e-07, + "loss": 0.1824, + "step": 2253 + }, + { + "epoch": 0.87, + "grad_norm": 2.8852710219387414, + "learning_rate": 4.261794826303783e-07, + "loss": 0.0863, + "step": 2254 + }, + { + "epoch": 0.87, + "grad_norm": 2.9884406942216013, + "learning_rate": 4.2365283435396366e-07, + "loss": 0.1911, + "step": 2255 + }, + { + "epoch": 0.87, + "grad_norm": 4.900171809048658, + "learning_rate": 4.211333667247125e-07, + "loss": 0.221, + "step": 2256 + }, + { + "epoch": 0.87, + "grad_norm": 5.6534575242811975, + "learning_rate": 4.186210836958676e-07, + "loss": 0.2694, + "step": 2257 + }, + { + "epoch": 0.87, + "grad_norm": 4.7072879599529545, + "learning_rate": 4.16115989209398e-07, + "loss": 0.4097, + "step": 2258 + }, + { + "epoch": 0.87, + "grad_norm": 3.1948718631671804, + "learning_rate": 4.1361808719599163e-07, + "loss": 0.1913, + "step": 2259 + }, + { + "epoch": 0.87, + "grad_norm": 3.3393465056721294, + "learning_rate": 4.111273815750527e-07, + "loss": 0.3053, + "step": 2260 + }, + { + "epoch": 0.87, + "grad_norm": 5.035462572360308, + "learning_rate": 4.086438762546946e-07, + "loss": 0.3196, + "step": 2261 + }, + { + "epoch": 0.87, + "grad_norm": 3.898366077230296, + "learning_rate": 4.0616757513173123e-07, + "loss": 0.3597, + "step": 2262 + }, + { + "epoch": 0.87, + "grad_norm": 3.2715825445215896, + "learning_rate": 4.036984820916723e-07, + "loss": 0.3562, + "step": 2263 + }, + { + "epoch": 0.88, + "grad_norm": 7.595156067461667, + "learning_rate": 4.012366010087193e-07, + "loss": 0.2183, + "step": 2264 + }, + { + "epoch": 0.88, + "grad_norm": 3.8984367186597173, + "learning_rate": 3.9878193574575566e-07, + "loss": 0.3473, + "step": 2265 + }, + { + "epoch": 0.88, + "grad_norm": 4.033654283570532, + "learning_rate": 3.963344901543437e-07, + "loss": 0.173, + "step": 2266 + }, + { + "epoch": 0.88, + "grad_norm": 3.1131666765963493, + "learning_rate": 3.9389426807471764e-07, + "loss": 0.3853, + "step": 2267 + }, + { + "epoch": 0.88, + "grad_norm": 4.1332637878964915, + "learning_rate": 3.9146127333577757e-07, + "loss": 0.2157, + "step": 2268 + }, + { + "epoch": 0.88, + "grad_norm": 4.480222224954902, + "learning_rate": 3.8903550975508075e-07, + "loss": 0.2085, + "step": 2269 + }, + { + "epoch": 0.88, + "grad_norm": 4.230038187245582, + "learning_rate": 3.866169811388415e-07, + "loss": 0.275, + "step": 2270 + }, + { + "epoch": 0.88, + "grad_norm": 5.393893906232192, + "learning_rate": 3.8420569128192084e-07, + "loss": 0.4166, + "step": 2271 + }, + { + "epoch": 0.88, + "grad_norm": 5.230483245488447, + "learning_rate": 3.8180164396782124e-07, + "loss": 0.3383, + "step": 2272 + }, + { + "epoch": 0.88, + "grad_norm": 2.492645965140245, + "learning_rate": 3.794048429686803e-07, + "loss": 0.2577, + "step": 2273 + }, + { + "epoch": 0.88, + "grad_norm": 5.665151567340731, + "learning_rate": 3.7701529204526856e-07, + "loss": 0.184, + "step": 2274 + }, + { + "epoch": 0.88, + "grad_norm": 4.269273239781175, + "learning_rate": 3.7463299494697735e-07, + "loss": 0.2679, + "step": 2275 + }, + { + "epoch": 0.88, + "grad_norm": 4.043086433472301, + "learning_rate": 3.722579554118172e-07, + "loss": 0.461, + "step": 2276 + }, + { + "epoch": 0.88, + "grad_norm": 6.427482468742114, + "learning_rate": 3.6989017716641206e-07, + "loss": 0.4518, + "step": 2277 + }, + { + "epoch": 0.88, + "grad_norm": 3.4356947674317664, + "learning_rate": 3.675296639259912e-07, + "loss": 0.3305, + "step": 2278 + }, + { + "epoch": 0.88, + "grad_norm": 5.243042908459477, + "learning_rate": 3.6517641939438407e-07, + "loss": 0.284, + "step": 2279 + }, + { + "epoch": 0.88, + "grad_norm": 3.7151734771965774, + "learning_rate": 3.6283044726401594e-07, + "loss": 0.2837, + "step": 2280 + }, + { + "epoch": 0.88, + "grad_norm": 2.9718825048206083, + "learning_rate": 3.604917512159012e-07, + "loss": 0.1858, + "step": 2281 + }, + { + "epoch": 0.88, + "grad_norm": 3.9004321754841875, + "learning_rate": 3.581603349196372e-07, + "loss": 0.3112, + "step": 2282 + }, + { + "epoch": 0.88, + "grad_norm": 2.767243144192718, + "learning_rate": 3.5583620203339774e-07, + "loss": 0.0786, + "step": 2283 + }, + { + "epoch": 0.88, + "grad_norm": 4.902165427525528, + "learning_rate": 3.5351935620393075e-07, + "loss": 0.3765, + "step": 2284 + }, + { + "epoch": 0.88, + "grad_norm": 3.9917697911824828, + "learning_rate": 3.5120980106654825e-07, + "loss": 0.1585, + "step": 2285 + }, + { + "epoch": 0.88, + "grad_norm": 3.2721323330322116, + "learning_rate": 3.4890754024512254e-07, + "loss": 0.094, + "step": 2286 + }, + { + "epoch": 0.88, + "grad_norm": 4.610634166327765, + "learning_rate": 3.466125773520818e-07, + "loss": 0.1695, + "step": 2287 + }, + { + "epoch": 0.88, + "grad_norm": 3.4325757687783907, + "learning_rate": 3.443249159884038e-07, + "loss": 0.1344, + "step": 2288 + }, + { + "epoch": 0.88, + "grad_norm": 2.7688963559167385, + "learning_rate": 3.420445597436056e-07, + "loss": 0.2178, + "step": 2289 + }, + { + "epoch": 0.89, + "grad_norm": 4.02149252263074, + "learning_rate": 3.397715121957468e-07, + "loss": 0.2878, + "step": 2290 + }, + { + "epoch": 0.89, + "grad_norm": 3.769142990340114, + "learning_rate": 3.3750577691141596e-07, + "loss": 0.31, + "step": 2291 + }, + { + "epoch": 0.89, + "grad_norm": 4.607562051900548, + "learning_rate": 3.352473574457304e-07, + "loss": 0.3982, + "step": 2292 + }, + { + "epoch": 0.89, + "grad_norm": 5.03675486612295, + "learning_rate": 3.329962573423262e-07, + "loss": 0.4732, + "step": 2293 + }, + { + "epoch": 0.89, + "grad_norm": 4.444472483450235, + "learning_rate": 3.3075248013335614e-07, + "loss": 0.2885, + "step": 2294 + }, + { + "epoch": 0.89, + "grad_norm": 3.842348385571747, + "learning_rate": 3.2851602933948236e-07, + "loss": 0.2496, + "step": 2295 + }, + { + "epoch": 0.89, + "grad_norm": 4.7327071436136965, + "learning_rate": 3.262869084698711e-07, + "loss": 0.4226, + "step": 2296 + }, + { + "epoch": 0.89, + "grad_norm": 6.625058061867976, + "learning_rate": 3.2406512102218903e-07, + "loss": 0.2804, + "step": 2297 + }, + { + "epoch": 0.89, + "grad_norm": 4.03330180543428, + "learning_rate": 3.2185067048259245e-07, + "loss": 0.3524, + "step": 2298 + }, + { + "epoch": 0.89, + "grad_norm": 4.780213531818724, + "learning_rate": 3.1964356032572866e-07, + "loss": 0.1921, + "step": 2299 + }, + { + "epoch": 0.89, + "grad_norm": 4.947730158346107, + "learning_rate": 3.174437940147268e-07, + "loss": 0.3891, + "step": 2300 + }, + { + "epoch": 0.89, + "grad_norm": 3.5808131036732567, + "learning_rate": 3.1525137500119207e-07, + "loss": 0.1723, + "step": 2301 + }, + { + "epoch": 0.89, + "grad_norm": 4.71210086029987, + "learning_rate": 3.1306630672520153e-07, + "loss": 0.2112, + "step": 2302 + }, + { + "epoch": 0.89, + "grad_norm": 4.460327447410836, + "learning_rate": 3.108885926152988e-07, + "loss": 0.2039, + "step": 2303 + }, + { + "epoch": 0.89, + "grad_norm": 4.575085992901881, + "learning_rate": 3.087182360884872e-07, + "loss": 0.3629, + "step": 2304 + }, + { + "epoch": 0.89, + "grad_norm": 3.369756650295015, + "learning_rate": 3.06555240550227e-07, + "loss": 0.1128, + "step": 2305 + }, + { + "epoch": 0.89, + "grad_norm": 4.312354009353822, + "learning_rate": 3.0439960939442794e-07, + "loss": 0.3821, + "step": 2306 + }, + { + "epoch": 0.89, + "grad_norm": 5.321709269718386, + "learning_rate": 3.0225134600344373e-07, + "loss": 0.5004, + "step": 2307 + }, + { + "epoch": 0.89, + "grad_norm": 4.050755875630375, + "learning_rate": 3.001104537480676e-07, + "loss": 0.4332, + "step": 2308 + }, + { + "epoch": 0.89, + "grad_norm": 4.694026462183988, + "learning_rate": 2.9797693598752673e-07, + "loss": 0.387, + "step": 2309 + }, + { + "epoch": 0.89, + "grad_norm": 4.192090971482895, + "learning_rate": 2.9585079606947843e-07, + "loss": 0.5603, + "step": 2310 + }, + { + "epoch": 0.89, + "grad_norm": 5.298875139962309, + "learning_rate": 2.9373203733000234e-07, + "loss": 0.1111, + "step": 2311 + }, + { + "epoch": 0.89, + "grad_norm": 7.005963960691346, + "learning_rate": 2.916206630935969e-07, + "loss": 0.174, + "step": 2312 + }, + { + "epoch": 0.89, + "grad_norm": 3.7574263191665196, + "learning_rate": 2.895166766731744e-07, + "loss": 0.0858, + "step": 2313 + }, + { + "epoch": 0.89, + "grad_norm": 2.9971631541008374, + "learning_rate": 2.874200813700534e-07, + "loss": 0.0948, + "step": 2314 + }, + { + "epoch": 0.89, + "grad_norm": 3.786237382549601, + "learning_rate": 2.8533088047395627e-07, + "loss": 0.2424, + "step": 2315 + }, + { + "epoch": 0.9, + "grad_norm": 4.323124485530052, + "learning_rate": 2.8324907726300366e-07, + "loss": 0.223, + "step": 2316 + }, + { + "epoch": 0.9, + "grad_norm": 5.728848173573977, + "learning_rate": 2.8117467500370756e-07, + "loss": 0.5376, + "step": 2317 + }, + { + "epoch": 0.9, + "grad_norm": 3.599013605641036, + "learning_rate": 2.7910767695096707e-07, + "loss": 0.2773, + "step": 2318 + }, + { + "epoch": 0.9, + "grad_norm": 3.751904231828526, + "learning_rate": 2.77048086348064e-07, + "loss": 0.351, + "step": 2319 + }, + { + "epoch": 0.9, + "grad_norm": 2.779065873499481, + "learning_rate": 2.7499590642665773e-07, + "loss": 0.2772, + "step": 2320 + }, + { + "epoch": 0.9, + "grad_norm": 3.677818272163841, + "learning_rate": 2.729511404067797e-07, + "loss": 0.2405, + "step": 2321 + }, + { + "epoch": 0.9, + "grad_norm": 5.661325741592654, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.5917, + "step": 2322 + }, + { + "epoch": 0.9, + "grad_norm": 3.932608846333246, + "learning_rate": 2.688838628935603e-07, + "loss": 0.2891, + "step": 2323 + }, + { + "epoch": 0.9, + "grad_norm": 5.041463500009093, + "learning_rate": 2.668613577820961e-07, + "loss": 0.5847, + "step": 2324 + }, + { + "epoch": 0.9, + "grad_norm": 3.529300253653549, + "learning_rate": 2.6484627933590414e-07, + "loss": 0.3205, + "step": 2325 + }, + { + "epoch": 0.9, + "grad_norm": 4.165638196233581, + "learning_rate": 2.628386307167996e-07, + "loss": 0.4662, + "step": 2326 + }, + { + "epoch": 0.9, + "grad_norm": 4.946145270811574, + "learning_rate": 2.608384150749416e-07, + "loss": 0.3197, + "step": 2327 + }, + { + "epoch": 0.9, + "grad_norm": 6.3117858943504945, + "learning_rate": 2.5884563554882336e-07, + "loss": 0.4974, + "step": 2328 + }, + { + "epoch": 0.9, + "grad_norm": 4.303863712464286, + "learning_rate": 2.5686029526527266e-07, + "loss": 0.1986, + "step": 2329 + }, + { + "epoch": 0.9, + "grad_norm": 4.002332008584412, + "learning_rate": 2.548823973394449e-07, + "loss": 0.2956, + "step": 2330 + }, + { + "epoch": 0.9, + "grad_norm": 3.870626259973056, + "learning_rate": 2.5291194487481575e-07, + "loss": 0.1783, + "step": 2331 + }, + { + "epoch": 0.9, + "grad_norm": 5.861901439263068, + "learning_rate": 2.509489409631799e-07, + "loss": 0.401, + "step": 2332 + }, + { + "epoch": 0.9, + "grad_norm": 3.7764678082233942, + "learning_rate": 2.4899338868464404e-07, + "loss": 0.4056, + "step": 2333 + }, + { + "epoch": 0.9, + "grad_norm": 5.863567510326271, + "learning_rate": 2.470452911076227e-07, + "loss": 0.4565, + "step": 2334 + }, + { + "epoch": 0.9, + "grad_norm": 5.340754799737173, + "learning_rate": 2.4510465128883387e-07, + "loss": 0.2217, + "step": 2335 + }, + { + "epoch": 0.9, + "grad_norm": 4.470125112029207, + "learning_rate": 2.431714722732942e-07, + "loss": 0.3163, + "step": 2336 + }, + { + "epoch": 0.9, + "grad_norm": 4.874890794744219, + "learning_rate": 2.4124575709431276e-07, + "loss": 0.3827, + "step": 2337 + }, + { + "epoch": 0.9, + "grad_norm": 4.626194552062349, + "learning_rate": 2.393275087734864e-07, + "loss": 0.2556, + "step": 2338 + }, + { + "epoch": 0.9, + "grad_norm": 5.434333608965055, + "learning_rate": 2.3741673032069757e-07, + "loss": 0.6067, + "step": 2339 + }, + { + "epoch": 0.9, + "grad_norm": 3.939447195699781, + "learning_rate": 2.355134247341073e-07, + "loss": 0.3322, + "step": 2340 + }, + { + "epoch": 0.91, + "grad_norm": 3.5766429794284593, + "learning_rate": 2.3361759500015123e-07, + "loss": 0.2931, + "step": 2341 + }, + { + "epoch": 0.91, + "grad_norm": 3.481706695120485, + "learning_rate": 2.317292440935348e-07, + "loss": 0.3289, + "step": 2342 + }, + { + "epoch": 0.91, + "grad_norm": 4.365543528322596, + "learning_rate": 2.2984837497722844e-07, + "loss": 0.5175, + "step": 2343 + }, + { + "epoch": 0.91, + "grad_norm": 5.433713418680976, + "learning_rate": 2.2797499060246253e-07, + "loss": 0.513, + "step": 2344 + }, + { + "epoch": 0.91, + "grad_norm": 3.599160013829929, + "learning_rate": 2.2610909390872516e-07, + "loss": 0.0912, + "step": 2345 + }, + { + "epoch": 0.91, + "grad_norm": 2.824188414041289, + "learning_rate": 2.242506878237538e-07, + "loss": 0.0429, + "step": 2346 + }, + { + "epoch": 0.91, + "grad_norm": 5.646961121734687, + "learning_rate": 2.2239977526353263e-07, + "loss": 0.4166, + "step": 2347 + }, + { + "epoch": 0.91, + "grad_norm": 2.720538238822282, + "learning_rate": 2.2055635913228845e-07, + "loss": 0.3376, + "step": 2348 + }, + { + "epoch": 0.91, + "grad_norm": 4.976217160454024, + "learning_rate": 2.1872044232248646e-07, + "loss": 0.2981, + "step": 2349 + }, + { + "epoch": 0.91, + "grad_norm": 3.9145300557247014, + "learning_rate": 2.1689202771482344e-07, + "loss": 0.1662, + "step": 2350 + }, + { + "epoch": 0.91, + "grad_norm": 3.984116177511633, + "learning_rate": 2.1507111817822445e-07, + "loss": 0.3808, + "step": 2351 + }, + { + "epoch": 0.91, + "grad_norm": 3.6122481167406004, + "learning_rate": 2.1325771656984075e-07, + "loss": 0.0804, + "step": 2352 + }, + { + "epoch": 0.91, + "grad_norm": 3.7008062089929408, + "learning_rate": 2.1145182573504008e-07, + "loss": 0.1593, + "step": 2353 + }, + { + "epoch": 0.91, + "grad_norm": 3.421804301547307, + "learning_rate": 2.0965344850740698e-07, + "loss": 0.1981, + "step": 2354 + }, + { + "epoch": 0.91, + "grad_norm": 3.7565947537910307, + "learning_rate": 2.0786258770873647e-07, + "loss": 0.1585, + "step": 2355 + }, + { + "epoch": 0.91, + "grad_norm": 3.2498597934224285, + "learning_rate": 2.060792461490302e-07, + "loss": 0.2037, + "step": 2356 + }, + { + "epoch": 0.91, + "grad_norm": 4.155498728314941, + "learning_rate": 2.043034266264887e-07, + "loss": 0.2652, + "step": 2357 + }, + { + "epoch": 0.91, + "grad_norm": 4.749570915651954, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.4169, + "step": 2358 + }, + { + "epoch": 0.91, + "grad_norm": 3.3698109676375467, + "learning_rate": 2.0077436482669687e-07, + "loss": 0.3276, + "step": 2359 + }, + { + "epoch": 0.91, + "grad_norm": 5.525804262974156, + "learning_rate": 1.9902112808682094e-07, + "loss": 0.3643, + "step": 2360 + }, + { + "epoch": 0.91, + "grad_norm": 3.869879927514824, + "learning_rate": 1.972754244588504e-07, + "loss": 0.1856, + "step": 2361 + }, + { + "epoch": 0.91, + "grad_norm": 4.699833572027133, + "learning_rate": 1.9553725668193192e-07, + "loss": 0.3203, + "step": 2362 + }, + { + "epoch": 0.91, + "grad_norm": 5.6582337350552505, + "learning_rate": 1.9380662748338662e-07, + "loss": 0.3739, + "step": 2363 + }, + { + "epoch": 0.91, + "grad_norm": 3.7386335227610403, + "learning_rate": 1.9208353957870684e-07, + "loss": 0.1664, + "step": 2364 + }, + { + "epoch": 0.91, + "grad_norm": 3.55949737725435, + "learning_rate": 1.903679956715526e-07, + "loss": 0.354, + "step": 2365 + }, + { + "epoch": 0.91, + "grad_norm": 4.413365297085287, + "learning_rate": 1.8865999845374794e-07, + "loss": 0.3282, + "step": 2366 + }, + { + "epoch": 0.92, + "grad_norm": 4.203806054562433, + "learning_rate": 1.8695955060527292e-07, + "loss": 0.1678, + "step": 2367 + }, + { + "epoch": 0.92, + "grad_norm": 4.435695808454198, + "learning_rate": 1.8526665479426386e-07, + "loss": 0.1676, + "step": 2368 + }, + { + "epoch": 0.92, + "grad_norm": 3.9413738835117917, + "learning_rate": 1.835813136770065e-07, + "loss": 0.2952, + "step": 2369 + }, + { + "epoch": 0.92, + "grad_norm": 3.9499772409701683, + "learning_rate": 1.8190352989793325e-07, + "loss": 0.4746, + "step": 2370 + }, + { + "epoch": 0.92, + "grad_norm": 3.913242103759934, + "learning_rate": 1.8023330608961886e-07, + "loss": 0.2626, + "step": 2371 + }, + { + "epoch": 0.92, + "grad_norm": 6.537523619393519, + "learning_rate": 1.7857064487277475e-07, + "loss": 0.5212, + "step": 2372 + }, + { + "epoch": 0.92, + "grad_norm": 5.952200206049917, + "learning_rate": 1.7691554885624628e-07, + "loss": 0.2876, + "step": 2373 + }, + { + "epoch": 0.92, + "grad_norm": 3.6971037175651738, + "learning_rate": 1.7526802063700943e-07, + "loss": 0.1865, + "step": 2374 + }, + { + "epoch": 0.92, + "grad_norm": 2.9720627918683027, + "learning_rate": 1.7362806280016464e-07, + "loss": 0.0704, + "step": 2375 + }, + { + "epoch": 0.92, + "grad_norm": 4.935582362599151, + "learning_rate": 1.7199567791893524e-07, + "loss": 0.4445, + "step": 2376 + }, + { + "epoch": 0.92, + "grad_norm": 5.341700486061497, + "learning_rate": 1.7037086855465902e-07, + "loss": 0.3082, + "step": 2377 + }, + { + "epoch": 0.92, + "grad_norm": 4.243106213865727, + "learning_rate": 1.6875363725679052e-07, + "loss": 0.3016, + "step": 2378 + }, + { + "epoch": 0.92, + "grad_norm": 5.07365651043146, + "learning_rate": 1.6714398656289154e-07, + "loss": 0.3474, + "step": 2379 + }, + { + "epoch": 0.92, + "grad_norm": 3.9638957363571796, + "learning_rate": 1.655419189986307e-07, + "loss": 0.5941, + "step": 2380 + }, + { + "epoch": 0.92, + "grad_norm": 4.250470121011741, + "learning_rate": 1.6394743707777772e-07, + "loss": 0.1473, + "step": 2381 + }, + { + "epoch": 0.92, + "grad_norm": 3.567418416925741, + "learning_rate": 1.6236054330219853e-07, + "loss": 0.3394, + "step": 2382 + }, + { + "epoch": 0.92, + "grad_norm": 4.834563403433531, + "learning_rate": 1.6078124016185525e-07, + "loss": 0.3445, + "step": 2383 + }, + { + "epoch": 0.92, + "grad_norm": 4.333859975716066, + "learning_rate": 1.592095301347968e-07, + "loss": 0.4406, + "step": 2384 + }, + { + "epoch": 0.92, + "grad_norm": 3.485342996585978, + "learning_rate": 1.576454156871604e-07, + "loss": 0.2695, + "step": 2385 + }, + { + "epoch": 0.92, + "grad_norm": 3.540749889940476, + "learning_rate": 1.5608889927316407e-07, + "loss": 0.1982, + "step": 2386 + }, + { + "epoch": 0.92, + "grad_norm": 4.284222162715035, + "learning_rate": 1.54539983335103e-07, + "loss": 0.1483, + "step": 2387 + }, + { + "epoch": 0.92, + "grad_norm": 5.167960942317522, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.3977, + "step": 2388 + }, + { + "epoch": 0.92, + "grad_norm": 3.9460848430489204, + "learning_rate": 1.5146496259634103e-07, + "loss": 0.3962, + "step": 2389 + }, + { + "epoch": 0.92, + "grad_norm": 3.877961840330807, + "learning_rate": 1.4993886262058833e-07, + "loss": 0.0831, + "step": 2390 + }, + { + "epoch": 0.92, + "grad_norm": 3.3243854065830085, + "learning_rate": 1.4842037277066012e-07, + "loss": 0.1368, + "step": 2391 + }, + { + "epoch": 0.92, + "grad_norm": 3.12653640296523, + "learning_rate": 1.469094954291872e-07, + "loss": 0.3493, + "step": 2392 + }, + { + "epoch": 0.93, + "grad_norm": 3.14409200126004, + "learning_rate": 1.4540623296685374e-07, + "loss": 0.328, + "step": 2393 + }, + { + "epoch": 0.93, + "grad_norm": 3.3420411567836426, + "learning_rate": 1.439105877423963e-07, + "loss": 0.331, + "step": 2394 + }, + { + "epoch": 0.93, + "grad_norm": 3.5829924976567007, + "learning_rate": 1.424225621025993e-07, + "loss": 0.2863, + "step": 2395 + }, + { + "epoch": 0.93, + "grad_norm": 3.5257665888028784, + "learning_rate": 1.4094215838229176e-07, + "loss": 0.1107, + "step": 2396 + }, + { + "epoch": 0.93, + "grad_norm": 3.6649501246247334, + "learning_rate": 1.3946937890434276e-07, + "loss": 0.3173, + "step": 2397 + }, + { + "epoch": 0.93, + "grad_norm": 4.444779951760428, + "learning_rate": 1.3800422597965935e-07, + "loss": 0.4135, + "step": 2398 + }, + { + "epoch": 0.93, + "grad_norm": 3.990551353390603, + "learning_rate": 1.3654670190718035e-07, + "loss": 0.1215, + "step": 2399 + }, + { + "epoch": 0.93, + "grad_norm": 3.3901286357647016, + "learning_rate": 1.350968089738758e-07, + "loss": 0.1025, + "step": 2400 + }, + { + "epoch": 0.93, + "grad_norm": 4.888378452337556, + "learning_rate": 1.3365454945474088e-07, + "loss": 0.4471, + "step": 2401 + }, + { + "epoch": 0.93, + "grad_norm": 3.942309701473477, + "learning_rate": 1.322199256127943e-07, + "loss": 0.4582, + "step": 2402 + }, + { + "epoch": 0.93, + "grad_norm": 4.572422844933958, + "learning_rate": 1.3079293969907259e-07, + "loss": 0.337, + "step": 2403 + }, + { + "epoch": 0.93, + "grad_norm": 4.299307252527783, + "learning_rate": 1.293735939526286e-07, + "loss": 0.1868, + "step": 2404 + }, + { + "epoch": 0.93, + "grad_norm": 3.9020506965455546, + "learning_rate": 1.279618906005281e-07, + "loss": 0.2337, + "step": 2405 + }, + { + "epoch": 0.93, + "grad_norm": 3.9987913798648007, + "learning_rate": 1.2655783185784253e-07, + "loss": 0.3397, + "step": 2406 + }, + { + "epoch": 0.93, + "grad_norm": 4.697259423572701, + "learning_rate": 1.2516141992765074e-07, + "loss": 0.4186, + "step": 2407 + }, + { + "epoch": 0.93, + "grad_norm": 3.6738815340929847, + "learning_rate": 1.2377265700103225e-07, + "loss": 0.2155, + "step": 2408 + }, + { + "epoch": 0.93, + "grad_norm": 3.590619004433068, + "learning_rate": 1.223915452570651e-07, + "loss": 0.1545, + "step": 2409 + }, + { + "epoch": 0.93, + "grad_norm": 4.898850891496661, + "learning_rate": 1.210180868628219e-07, + "loss": 0.2885, + "step": 2410 + }, + { + "epoch": 0.93, + "grad_norm": 5.5973904171805255, + "learning_rate": 1.196522839733666e-07, + "loss": 0.5292, + "step": 2411 + }, + { + "epoch": 0.93, + "grad_norm": 5.26431185935259, + "learning_rate": 1.1829413873174988e-07, + "loss": 0.4353, + "step": 2412 + }, + { + "epoch": 0.93, + "grad_norm": 4.371513798825584, + "learning_rate": 1.1694365326900936e-07, + "loss": 0.1542, + "step": 2413 + }, + { + "epoch": 0.93, + "grad_norm": 3.806828363526848, + "learning_rate": 1.1560082970416164e-07, + "loss": 0.3467, + "step": 2414 + }, + { + "epoch": 0.93, + "grad_norm": 3.5213477132783852, + "learning_rate": 1.1426567014420297e-07, + "loss": 0.3744, + "step": 2415 + }, + { + "epoch": 0.93, + "grad_norm": 3.090420360408503, + "learning_rate": 1.1293817668410201e-07, + "loss": 0.2608, + "step": 2416 + }, + { + "epoch": 0.93, + "grad_norm": 4.183683274675307, + "learning_rate": 1.1161835140680033e-07, + "loss": 0.2705, + "step": 2417 + }, + { + "epoch": 0.93, + "grad_norm": 4.716722040421792, + "learning_rate": 1.1030619638320805e-07, + "loss": 0.1356, + "step": 2418 + }, + { + "epoch": 0.94, + "grad_norm": 4.024852555730413, + "learning_rate": 1.0900171367219826e-07, + "loss": 0.2136, + "step": 2419 + }, + { + "epoch": 0.94, + "grad_norm": 4.375596574605777, + "learning_rate": 1.0770490532060696e-07, + "loss": 0.1455, + "step": 2420 + }, + { + "epoch": 0.94, + "grad_norm": 3.1324080617824577, + "learning_rate": 1.0641577336322761e-07, + "loss": 0.1473, + "step": 2421 + }, + { + "epoch": 0.94, + "grad_norm": 4.300359247540792, + "learning_rate": 1.0513431982280997e-07, + "loss": 0.5602, + "step": 2422 + }, + { + "epoch": 0.94, + "grad_norm": 2.9591857162850492, + "learning_rate": 1.0386054671005452e-07, + "loss": 0.0584, + "step": 2423 + }, + { + "epoch": 0.94, + "grad_norm": 5.206503491737669, + "learning_rate": 1.0259445602361084e-07, + "loss": 0.4306, + "step": 2424 + }, + { + "epoch": 0.94, + "grad_norm": 4.846524563135461, + "learning_rate": 1.0133604975007483e-07, + "loss": 0.241, + "step": 2425 + }, + { + "epoch": 0.94, + "grad_norm": 5.04315192848501, + "learning_rate": 1.0008532986398422e-07, + "loss": 0.1092, + "step": 2426 + }, + { + "epoch": 0.94, + "grad_norm": 4.62176548047698, + "learning_rate": 9.884229832781644e-08, + "loss": 0.4157, + "step": 2427 + }, + { + "epoch": 0.94, + "grad_norm": 5.411617320395682, + "learning_rate": 9.760695709198575e-08, + "loss": 0.1242, + "step": 2428 + }, + { + "epoch": 0.94, + "grad_norm": 4.147046863685668, + "learning_rate": 9.637930809483942e-08, + "loss": 0.4138, + "step": 2429 + }, + { + "epoch": 0.94, + "grad_norm": 5.8360773978835905, + "learning_rate": 9.51593532626538e-08, + "loss": 0.3258, + "step": 2430 + }, + { + "epoch": 0.94, + "grad_norm": 4.819721248087985, + "learning_rate": 9.39470945096349e-08, + "loss": 0.1967, + "step": 2431 + }, + { + "epoch": 0.94, + "grad_norm": 2.946631832138262, + "learning_rate": 9.274253373791064e-08, + "loss": 0.3869, + "step": 2432 + }, + { + "epoch": 0.94, + "grad_norm": 5.5429212520894335, + "learning_rate": 9.154567283753135e-08, + "loss": 0.1746, + "step": 2433 + }, + { + "epoch": 0.94, + "grad_norm": 3.8105027567396035, + "learning_rate": 9.035651368646647e-08, + "loss": 0.3425, + "step": 2434 + }, + { + "epoch": 0.94, + "grad_norm": 4.869488263166803, + "learning_rate": 8.917505815059902e-08, + "loss": 0.1844, + "step": 2435 + }, + { + "epoch": 0.94, + "grad_norm": 5.178460812695998, + "learning_rate": 8.800130808372553e-08, + "loss": 0.5737, + "step": 2436 + }, + { + "epoch": 0.94, + "grad_norm": 4.099213248905488, + "learning_rate": 8.683526532755171e-08, + "loss": 0.3112, + "step": 2437 + }, + { + "epoch": 0.94, + "grad_norm": 3.4551360585327795, + "learning_rate": 8.567693171168956e-08, + "loss": 0.2497, + "step": 2438 + }, + { + "epoch": 0.94, + "grad_norm": 4.149425301022077, + "learning_rate": 8.452630905365633e-08, + "loss": 0.2785, + "step": 2439 + }, + { + "epoch": 0.94, + "grad_norm": 4.857104396726367, + "learning_rate": 8.338339915886784e-08, + "loss": 0.2252, + "step": 2440 + }, + { + "epoch": 0.94, + "grad_norm": 3.267349455333625, + "learning_rate": 8.224820382064013e-08, + "loss": 0.3577, + "step": 2441 + }, + { + "epoch": 0.94, + "grad_norm": 4.846666449441902, + "learning_rate": 8.11207248201834e-08, + "loss": 0.2175, + "step": 2442 + }, + { + "epoch": 0.94, + "grad_norm": 3.739712327070508, + "learning_rate": 8.000096392660029e-08, + "loss": 0.2764, + "step": 2443 + }, + { + "epoch": 0.94, + "grad_norm": 4.680967064212557, + "learning_rate": 7.888892289688366e-08, + "loss": 0.3105, + "step": 2444 + }, + { + "epoch": 0.95, + "grad_norm": 6.497417055358871, + "learning_rate": 7.778460347591277e-08, + "loss": 0.3753, + "step": 2445 + }, + { + "epoch": 0.95, + "grad_norm": 5.134835352692148, + "learning_rate": 7.6688007396451e-08, + "loss": 0.3449, + "step": 2446 + }, + { + "epoch": 0.95, + "grad_norm": 4.954947804012956, + "learning_rate": 7.559913637914418e-08, + "loss": 0.225, + "step": 2447 + }, + { + "epoch": 0.95, + "grad_norm": 4.680352421863636, + "learning_rate": 7.45179921325162e-08, + "loss": 0.3417, + "step": 2448 + }, + { + "epoch": 0.95, + "grad_norm": 4.549047612322732, + "learning_rate": 7.34445763529662e-08, + "loss": 0.2182, + "step": 2449 + }, + { + "epoch": 0.95, + "grad_norm": 6.698221199963706, + "learning_rate": 7.237889072476856e-08, + "loss": 0.2427, + "step": 2450 + }, + { + "epoch": 0.95, + "grad_norm": 4.2043257289544425, + "learning_rate": 7.13209369200668e-08, + "loss": 0.422, + "step": 2451 + }, + { + "epoch": 0.95, + "grad_norm": 5.506784247214024, + "learning_rate": 7.027071659887364e-08, + "loss": 0.6956, + "step": 2452 + }, + { + "epoch": 0.95, + "grad_norm": 4.5338379473216435, + "learning_rate": 6.922823140906754e-08, + "loss": 0.39, + "step": 2453 + }, + { + "epoch": 0.95, + "grad_norm": 3.4635245942883532, + "learning_rate": 6.819348298638839e-08, + "loss": 0.2996, + "step": 2454 + }, + { + "epoch": 0.95, + "grad_norm": 4.628592924851369, + "learning_rate": 6.716647295443801e-08, + "loss": 0.2383, + "step": 2455 + }, + { + "epoch": 0.95, + "grad_norm": 5.346329942877061, + "learning_rate": 6.614720292467569e-08, + "loss": 0.4178, + "step": 2456 + }, + { + "epoch": 0.95, + "grad_norm": 5.454209308730887, + "learning_rate": 6.5135674496416e-08, + "loss": 0.4222, + "step": 2457 + }, + { + "epoch": 0.95, + "grad_norm": 5.897875667018516, + "learning_rate": 6.4131889256826e-08, + "loss": 0.411, + "step": 2458 + }, + { + "epoch": 0.95, + "grad_norm": 2.920827659381575, + "learning_rate": 6.313584878092361e-08, + "loss": 0.2343, + "step": 2459 + }, + { + "epoch": 0.95, + "grad_norm": 3.6797078298491557, + "learning_rate": 6.214755463157417e-08, + "loss": 0.3456, + "step": 2460 + }, + { + "epoch": 0.95, + "grad_norm": 3.7075003504881185, + "learning_rate": 6.116700835948842e-08, + "loss": 0.4986, + "step": 2461 + }, + { + "epoch": 0.95, + "grad_norm": 3.9655977972924346, + "learning_rate": 6.019421150322114e-08, + "loss": 0.3024, + "step": 2462 + }, + { + "epoch": 0.95, + "grad_norm": 3.809475860857254, + "learning_rate": 5.922916558916581e-08, + "loss": 0.1581, + "step": 2463 + }, + { + "epoch": 0.95, + "grad_norm": 6.772941070339243, + "learning_rate": 5.8271872131555605e-08, + "loss": 0.1239, + "step": 2464 + }, + { + "epoch": 0.95, + "grad_norm": 4.201401866483458, + "learning_rate": 5.7322332632458454e-08, + "loss": 0.2997, + "step": 2465 + }, + { + "epoch": 0.95, + "grad_norm": 2.6027730570681804, + "learning_rate": 5.638054858177644e-08, + "loss": 0.0951, + "step": 2466 + }, + { + "epoch": 0.95, + "grad_norm": 4.533120472850452, + "learning_rate": 5.544652145724305e-08, + "loss": 0.2793, + "step": 2467 + }, + { + "epoch": 0.95, + "grad_norm": 3.8829477134747488, + "learning_rate": 5.45202527244193e-08, + "loss": 0.2008, + "step": 2468 + }, + { + "epoch": 0.95, + "grad_norm": 4.782765987982104, + "learning_rate": 5.36017438366937e-08, + "loss": 0.2675, + "step": 2469 + }, + { + "epoch": 0.95, + "grad_norm": 2.1394937379714922, + "learning_rate": 5.26909962352784e-08, + "loss": 0.2646, + "step": 2470 + }, + { + "epoch": 0.96, + "grad_norm": 4.071604895324785, + "learning_rate": 5.178801134920808e-08, + "loss": 0.1296, + "step": 2471 + }, + { + "epoch": 0.96, + "grad_norm": 5.046037995794092, + "learning_rate": 5.089279059533658e-08, + "loss": 0.5076, + "step": 2472 + }, + { + "epoch": 0.96, + "grad_norm": 4.989306541734293, + "learning_rate": 5.0005335378335294e-08, + "loss": 0.4052, + "step": 2473 + }, + { + "epoch": 0.96, + "grad_norm": 4.108135623557452, + "learning_rate": 4.91256470906909e-08, + "loss": 0.4083, + "step": 2474 + }, + { + "epoch": 0.96, + "grad_norm": 6.125310792739269, + "learning_rate": 4.825372711270371e-08, + "loss": 0.2845, + "step": 2475 + }, + { + "epoch": 0.96, + "grad_norm": 5.218625236456331, + "learning_rate": 4.73895768124838e-08, + "loss": 0.382, + "step": 2476 + }, + { + "epoch": 0.96, + "grad_norm": 4.506600050375412, + "learning_rate": 4.653319754595098e-08, + "loss": 0.4095, + "step": 2477 + }, + { + "epoch": 0.96, + "grad_norm": 3.462363796117681, + "learning_rate": 4.568459065683206e-08, + "loss": 0.1888, + "step": 2478 + }, + { + "epoch": 0.96, + "grad_norm": 3.1031074445540283, + "learning_rate": 4.484375747665637e-08, + "loss": 0.2608, + "step": 2479 + }, + { + "epoch": 0.96, + "grad_norm": 3.9194260512973553, + "learning_rate": 4.401069932475799e-08, + "loss": 0.3155, + "step": 2480 + }, + { + "epoch": 0.96, + "grad_norm": 2.9076216174540166, + "learning_rate": 4.318541750827021e-08, + "loss": 0.065, + "step": 2481 + }, + { + "epoch": 0.96, + "grad_norm": 3.0033303580822226, + "learning_rate": 4.236791332212498e-08, + "loss": 0.4254, + "step": 2482 + }, + { + "epoch": 0.96, + "grad_norm": 4.026903988839854, + "learning_rate": 4.155818804904954e-08, + "loss": 0.107, + "step": 2483 + }, + { + "epoch": 0.96, + "grad_norm": 5.839210054593576, + "learning_rate": 4.0756242959567596e-08, + "loss": 0.5627, + "step": 2484 + }, + { + "epoch": 0.96, + "grad_norm": 5.956139091148291, + "learning_rate": 3.996207931199314e-08, + "loss": 0.3366, + "step": 2485 + }, + { + "epoch": 0.96, + "grad_norm": 6.582273864155257, + "learning_rate": 3.917569835243107e-08, + "loss": 0.2606, + "step": 2486 + }, + { + "epoch": 0.96, + "grad_norm": 3.8876436065674, + "learning_rate": 3.839710131477492e-08, + "loss": 0.1378, + "step": 2487 + }, + { + "epoch": 0.96, + "grad_norm": 4.563414680610009, + "learning_rate": 3.762628942070412e-08, + "loss": 0.192, + "step": 2488 + }, + { + "epoch": 0.96, + "grad_norm": 4.570767032609257, + "learning_rate": 3.686326387968286e-08, + "loss": 0.1846, + "step": 2489 + }, + { + "epoch": 0.96, + "grad_norm": 4.681878056667321, + "learning_rate": 3.610802588895845e-08, + "loss": 0.176, + "step": 2490 + }, + { + "epoch": 0.96, + "grad_norm": 3.7111296544316255, + "learning_rate": 3.536057663355852e-08, + "loss": 0.246, + "step": 2491 + }, + { + "epoch": 0.96, + "grad_norm": 4.429892003758981, + "learning_rate": 3.462091728628936e-08, + "loss": 0.5231, + "step": 2492 + }, + { + "epoch": 0.96, + "grad_norm": 3.7128950589879866, + "learning_rate": 3.388904900773371e-08, + "loss": 0.3555, + "step": 2493 + }, + { + "epoch": 0.96, + "grad_norm": 5.793644159392157, + "learning_rate": 3.316497294625132e-08, + "loss": 0.6867, + "step": 2494 + }, + { + "epoch": 0.96, + "grad_norm": 4.180988923079452, + "learning_rate": 3.2448690237973366e-08, + "loss": 0.4084, + "step": 2495 + }, + { + "epoch": 0.96, + "grad_norm": 7.287777924413648, + "learning_rate": 3.1740202006804166e-08, + "loss": 0.2953, + "step": 2496 + }, + { + "epoch": 0.97, + "grad_norm": 3.329746316314543, + "learning_rate": 3.1039509364416686e-08, + "loss": 0.3636, + "step": 2497 + }, + { + "epoch": 0.97, + "grad_norm": 4.338994924839404, + "learning_rate": 3.034661341025258e-08, + "loss": 0.3093, + "step": 2498 + }, + { + "epoch": 0.97, + "grad_norm": 3.9125712203770338, + "learning_rate": 2.96615152315205e-08, + "loss": 0.3362, + "step": 2499 + }, + { + "epoch": 0.97, + "grad_norm": 4.353841186113316, + "learning_rate": 2.898421590319278e-08, + "loss": 0.4441, + "step": 2500 + }, + { + "epoch": 0.97, + "grad_norm": 3.726307880994095, + "learning_rate": 2.8314716488004878e-08, + "loss": 0.3498, + "step": 2501 + }, + { + "epoch": 0.97, + "grad_norm": 4.993050474232988, + "learning_rate": 2.765301803645426e-08, + "loss": 0.2487, + "step": 2502 + }, + { + "epoch": 0.97, + "grad_norm": 3.065567782858954, + "learning_rate": 2.6999121586797626e-08, + "loss": 0.3005, + "step": 2503 + }, + { + "epoch": 0.97, + "grad_norm": 4.870193447538301, + "learning_rate": 2.6353028165049254e-08, + "loss": 0.315, + "step": 2504 + }, + { + "epoch": 0.97, + "grad_norm": 4.270123183126187, + "learning_rate": 2.5714738784980986e-08, + "loss": 0.1365, + "step": 2505 + }, + { + "epoch": 0.97, + "grad_norm": 4.571330197561437, + "learning_rate": 2.5084254448117794e-08, + "loss": 0.3508, + "step": 2506 + }, + { + "epoch": 0.97, + "grad_norm": 5.005644196773933, + "learning_rate": 2.4461576143740007e-08, + "loss": 0.5137, + "step": 2507 + }, + { + "epoch": 0.97, + "grad_norm": 4.374905997459383, + "learning_rate": 2.3846704848878298e-08, + "loss": 0.3205, + "step": 2508 + }, + { + "epoch": 0.97, + "grad_norm": 5.337172424869481, + "learning_rate": 2.323964152831426e-08, + "loss": 0.2395, + "step": 2509 + }, + { + "epoch": 0.97, + "grad_norm": 3.6707544297850334, + "learning_rate": 2.264038713457706e-08, + "loss": 0.1153, + "step": 2510 + }, + { + "epoch": 0.97, + "grad_norm": 4.921124694953214, + "learning_rate": 2.204894260794399e-08, + "loss": 0.2637, + "step": 2511 + }, + { + "epoch": 0.97, + "grad_norm": 2.6874069579673088, + "learning_rate": 2.1465308876438275e-08, + "loss": 0.2582, + "step": 2512 + }, + { + "epoch": 0.97, + "grad_norm": 3.7975784533444275, + "learning_rate": 2.0889486855826812e-08, + "loss": 0.2238, + "step": 2513 + }, + { + "epoch": 0.97, + "grad_norm": 4.311751711747407, + "learning_rate": 2.0321477449619098e-08, + "loss": 0.3692, + "step": 2514 + }, + { + "epoch": 0.97, + "grad_norm": 4.945762699148563, + "learning_rate": 1.9761281549067202e-08, + "loss": 0.3589, + "step": 2515 + }, + { + "epoch": 0.97, + "grad_norm": 3.049157802613744, + "learning_rate": 1.9208900033161338e-08, + "loss": 0.06, + "step": 2516 + }, + { + "epoch": 0.97, + "grad_norm": 2.7649032589146585, + "learning_rate": 1.8664333768631526e-08, + "loss": 0.0726, + "step": 2517 + }, + { + "epoch": 0.97, + "grad_norm": 3.3304364903166617, + "learning_rate": 1.8127583609945376e-08, + "loss": 0.4464, + "step": 2518 + }, + { + "epoch": 0.97, + "grad_norm": 4.564667529502882, + "learning_rate": 1.7598650399305862e-08, + "loss": 0.1721, + "step": 2519 + }, + { + "epoch": 0.97, + "grad_norm": 4.428294691529022, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.3292, + "step": 2520 + }, + { + "epoch": 0.97, + "grad_norm": 4.458748330778354, + "learning_rate": 1.6564238129650468e-08, + "loss": 0.1685, + "step": 2521 + }, + { + "epoch": 0.98, + "grad_norm": 3.871700938049787, + "learning_rate": 1.6058760693708487e-08, + "loss": 0.3076, + "step": 2522 + }, + { + "epoch": 0.98, + "grad_norm": 3.5337468242527925, + "learning_rate": 1.556110345195816e-08, + "loss": 0.4248, + "step": 2523 + }, + { + "epoch": 0.98, + "grad_norm": 3.322905065071024, + "learning_rate": 1.5071267185262086e-08, + "loss": 0.1864, + "step": 2524 + }, + { + "epoch": 0.98, + "grad_norm": 4.504052845237909, + "learning_rate": 1.4589252662213227e-08, + "loss": 0.3098, + "step": 2525 + }, + { + "epoch": 0.98, + "grad_norm": 5.479172864918825, + "learning_rate": 1.411506063912882e-08, + "loss": 0.3844, + "step": 2526 + }, + { + "epoch": 0.98, + "grad_norm": 4.091046952414351, + "learning_rate": 1.3648691860053686e-08, + "loss": 0.1502, + "step": 2527 + }, + { + "epoch": 0.98, + "grad_norm": 5.0671040189744, + "learning_rate": 1.3190147056757474e-08, + "loss": 0.3661, + "step": 2528 + }, + { + "epoch": 0.98, + "grad_norm": 4.367494494280107, + "learning_rate": 1.2739426948732426e-08, + "loss": 0.2815, + "step": 2529 + }, + { + "epoch": 0.98, + "grad_norm": 5.5721072977285555, + "learning_rate": 1.2296532243193382e-08, + "loss": 0.3263, + "step": 2530 + }, + { + "epoch": 0.98, + "grad_norm": 5.522497550698559, + "learning_rate": 1.1861463635077785e-08, + "loss": 0.3777, + "step": 2531 + }, + { + "epoch": 0.98, + "grad_norm": 3.460847448476461, + "learning_rate": 1.1434221807041234e-08, + "loss": 0.2931, + "step": 2532 + }, + { + "epoch": 0.98, + "grad_norm": 3.0025198698598334, + "learning_rate": 1.1014807429460262e-08, + "loss": 0.1078, + "step": 2533 + }, + { + "epoch": 0.98, + "grad_norm": 3.657526819003815, + "learning_rate": 1.0603221160429e-08, + "loss": 0.3756, + "step": 2534 + }, + { + "epoch": 0.98, + "grad_norm": 3.4038560085940524, + "learning_rate": 1.019946364575808e-08, + "loss": 0.1278, + "step": 2535 + }, + { + "epoch": 0.98, + "grad_norm": 4.22480764244677, + "learning_rate": 9.803535518975171e-09, + "loss": 0.1412, + "step": 2536 + }, + { + "epoch": 0.98, + "grad_norm": 4.147722850177204, + "learning_rate": 9.41543740132167e-09, + "loss": 0.3178, + "step": 2537 + }, + { + "epoch": 0.98, + "grad_norm": 5.132280351434606, + "learning_rate": 9.035169901754902e-09, + "loss": 0.3441, + "step": 2538 + }, + { + "epoch": 0.98, + "grad_norm": 6.055217266405239, + "learning_rate": 8.662733616944253e-09, + "loss": 0.4877, + "step": 2539 + }, + { + "epoch": 0.98, + "grad_norm": 3.9514117165189906, + "learning_rate": 8.298129131270594e-09, + "loss": 0.2908, + "step": 2540 + }, + { + "epoch": 0.98, + "grad_norm": 4.862555042803328, + "learning_rate": 7.941357016827967e-09, + "loss": 0.3591, + "step": 2541 + }, + { + "epoch": 0.98, + "grad_norm": 3.727874665401667, + "learning_rate": 7.59241783341913e-09, + "loss": 0.3701, + "step": 2542 + }, + { + "epoch": 0.98, + "grad_norm": 4.120515532998743, + "learning_rate": 7.251312128556675e-09, + "loss": 0.3953, + "step": 2543 + }, + { + "epoch": 0.98, + "grad_norm": 3.7310449788079407, + "learning_rate": 6.918040437463025e-09, + "loss": 0.29, + "step": 2544 + }, + { + "epoch": 0.98, + "grad_norm": 4.307580188884742, + "learning_rate": 6.592603283067101e-09, + "loss": 0.1891, + "step": 2545 + }, + { + "epoch": 0.98, + "grad_norm": 4.009717756133829, + "learning_rate": 6.2750011760054355e-09, + "loss": 0.15, + "step": 2546 + }, + { + "epoch": 0.98, + "grad_norm": 3.801223121652817, + "learning_rate": 5.965234614620508e-09, + "loss": 0.2152, + "step": 2547 + }, + { + "epoch": 0.99, + "grad_norm": 3.358510602122476, + "learning_rate": 5.6633040849601865e-09, + "loss": 0.1348, + "step": 2548 + }, + { + "epoch": 0.99, + "grad_norm": 3.2923506636351823, + "learning_rate": 5.369210060777175e-09, + "loss": 0.1001, + "step": 2549 + }, + { + "epoch": 0.99, + "grad_norm": 3.512827829846558, + "learning_rate": 5.082953003528457e-09, + "loss": 0.231, + "step": 2550 + }, + { + "epoch": 0.99, + "grad_norm": 5.179076900260911, + "learning_rate": 4.804533362373076e-09, + "loss": 0.4858, + "step": 2551 + }, + { + "epoch": 0.99, + "grad_norm": 4.853661165753404, + "learning_rate": 4.533951574174356e-09, + "loss": 0.1578, + "step": 2552 + }, + { + "epoch": 0.99, + "grad_norm": 4.590908201313937, + "learning_rate": 4.2712080634949024e-09, + "loss": 0.2932, + "step": 2553 + }, + { + "epoch": 0.99, + "grad_norm": 4.542632474681511, + "learning_rate": 4.016303242600495e-09, + "loss": 0.355, + "step": 2554 + }, + { + "epoch": 0.99, + "grad_norm": 3.927745661422115, + "learning_rate": 3.769237511457302e-09, + "loss": 0.3641, + "step": 2555 + }, + { + "epoch": 0.99, + "grad_norm": 3.3152458610500832, + "learning_rate": 3.530011257730226e-09, + "loss": 0.0891, + "step": 2556 + }, + { + "epoch": 0.99, + "grad_norm": 5.026774623379328, + "learning_rate": 3.298624856784005e-09, + "loss": 0.347, + "step": 2557 + }, + { + "epoch": 0.99, + "grad_norm": 4.817238944433984, + "learning_rate": 3.075078671682108e-09, + "loss": 0.3206, + "step": 2558 + }, + { + "epoch": 0.99, + "grad_norm": 4.642502782905127, + "learning_rate": 2.8593730531861764e-09, + "loss": 0.3517, + "step": 2559 + }, + { + "epoch": 0.99, + "grad_norm": 4.382834139043959, + "learning_rate": 2.6515083397549156e-09, + "loss": 0.3784, + "step": 2560 + }, + { + "epoch": 0.99, + "grad_norm": 4.029589267164337, + "learning_rate": 2.4514848575446505e-09, + "loss": 0.2407, + "step": 2561 + }, + { + "epoch": 0.99, + "grad_norm": 5.613691822112342, + "learning_rate": 2.2593029204076578e-09, + "loss": 0.4333, + "step": 2562 + }, + { + "epoch": 0.99, + "grad_norm": 3.9969353002967742, + "learning_rate": 2.0749628298921688e-09, + "loss": 0.2206, + "step": 2563 + }, + { + "epoch": 0.99, + "grad_norm": 4.782340417296486, + "learning_rate": 1.8984648752429222e-09, + "loss": 0.3964, + "step": 2564 + }, + { + "epoch": 0.99, + "grad_norm": 3.638242369175846, + "learning_rate": 1.7298093333989463e-09, + "loss": 0.0925, + "step": 2565 + }, + { + "epoch": 0.99, + "grad_norm": 3.926706281697563, + "learning_rate": 1.5689964689935555e-09, + "loss": 0.2707, + "step": 2566 + }, + { + "epoch": 0.99, + "grad_norm": 3.4660002503863585, + "learning_rate": 1.4160265343549084e-09, + "loss": 0.2439, + "step": 2567 + }, + { + "epoch": 0.99, + "grad_norm": 3.72353386822579, + "learning_rate": 1.2708997695043412e-09, + "loss": 0.082, + "step": 2568 + }, + { + "epoch": 0.99, + "grad_norm": 4.715665495542824, + "learning_rate": 1.133616402158033e-09, + "loss": 0.3755, + "step": 2569 + }, + { + "epoch": 0.99, + "grad_norm": 4.597729541155232, + "learning_rate": 1.004176647724231e-09, + "loss": 0.4678, + "step": 2570 + }, + { + "epoch": 0.99, + "grad_norm": 3.759024022744002, + "learning_rate": 8.82580709303249e-10, + "loss": 0.1463, + "step": 2571 + }, + { + "epoch": 0.99, + "grad_norm": 3.7135158349399124, + "learning_rate": 7.688287776896897e-10, + "loss": 0.3065, + "step": 2572 + }, + { + "epoch": 0.99, + "grad_norm": 2.9021686062557372, + "learning_rate": 6.629210313680023e-10, + "loss": 0.123, + "step": 2573 + }, + { + "epoch": 1.0, + "grad_norm": 3.661695973328677, + "learning_rate": 5.648576365169245e-10, + "loss": 0.2247, + "step": 2574 + }, + { + "epoch": 1.0, + "grad_norm": 4.85994131895302, + "learning_rate": 4.746387470044855e-10, + "loss": 0.1752, + "step": 2575 + }, + { + "epoch": 1.0, + "grad_norm": 4.889415920762853, + "learning_rate": 3.922645043924478e-10, + "loss": 0.379, + "step": 2576 + }, + { + "epoch": 1.0, + "grad_norm": 3.786370660553833, + "learning_rate": 3.1773503793131043e-10, + "loss": 0.3649, + "step": 2577 + }, + { + "epoch": 1.0, + "grad_norm": 5.692099772632725, + "learning_rate": 2.5105046456475047e-10, + "loss": 0.3483, + "step": 2578 + }, + { + "epoch": 1.0, + "grad_norm": 3.9958814936777114, + "learning_rate": 1.9221088892518169e-10, + "loss": 0.4529, + "step": 2579 + }, + { + "epoch": 1.0, + "grad_norm": 5.821969571566754, + "learning_rate": 1.4121640333653042e-10, + "loss": 0.4975, + "step": 2580 + }, + { + "epoch": 1.0, + "grad_norm": 4.220564095926847, + "learning_rate": 9.806708781368025e-11, + "loss": 0.3491, + "step": 2581 + }, + { + "epoch": 1.0, + "grad_norm": 4.874333232465943, + "learning_rate": 6.276301006080676e-11, + "loss": 0.2107, + "step": 2582 + }, + { + "epoch": 1.0, + "grad_norm": 5.085812035622458, + "learning_rate": 3.530422547304291e-11, + "loss": 0.2492, + "step": 2583 + }, + { + "epoch": 1.0, + "grad_norm": 5.090912860808705, + "learning_rate": 1.5690777135368707e-11, + "loss": 0.2154, + "step": 2584 + }, + { + "epoch": 1.0, + "grad_norm": 3.693481016317709, + "learning_rate": 3.922695822611289e-12, + "loss": 0.2986, + "step": 2585 + }, + { + "epoch": 1.0, + "grad_norm": 7.069385969005949, + "learning_rate": 0.0, + "loss": 0.5263, + "step": 2586 + }, + { + "epoch": 1.0, + "step": 2586, + "total_flos": 0.0, + "train_loss": 0.5355383884464229, + "train_runtime": 62473.0256, + "train_samples_per_second": 10.599, + "train_steps_per_second": 0.041 + } + ], + "logging_steps": 1.0, + "max_steps": 2586, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}