{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996199941537562, "eval_steps": 500, "global_step": 639, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00467699503069278, "grad_norm": 0.5310407876968384, "learning_rate": 9.98435054773083e-06, "loss": 15.7421, "step": 1 }, { "epoch": 0.00935399006138556, "grad_norm": 0.6928378343582153, "learning_rate": 9.96870109546166e-06, "loss": 22.4837, "step": 2 }, { "epoch": 0.014030985092078339, "grad_norm": 0.5985817909240723, "learning_rate": 9.953051643192489e-06, "loss": 19.2438, "step": 3 }, { "epoch": 0.01870798012277112, "grad_norm": 0.3373333215713501, "learning_rate": 9.937402190923318e-06, "loss": 17.8898, "step": 4 }, { "epoch": 0.0233849751534639, "grad_norm": 0.41831186413764954, "learning_rate": 9.921752738654147e-06, "loss": 17.4461, "step": 5 }, { "epoch": 0.028061970184156678, "grad_norm": 0.3748932182788849, "learning_rate": 9.906103286384977e-06, "loss": 17.7855, "step": 6 }, { "epoch": 0.03273896521484946, "grad_norm": 0.3773082494735718, "learning_rate": 9.890453834115806e-06, "loss": 16.9238, "step": 7 }, { "epoch": 0.03741596024554224, "grad_norm": 0.3397878408432007, "learning_rate": 9.874804381846637e-06, "loss": 16.7176, "step": 8 }, { "epoch": 0.04209295527623502, "grad_norm": 0.5136957168579102, "learning_rate": 9.859154929577466e-06, "loss": 15.4666, "step": 9 }, { "epoch": 0.0467699503069278, "grad_norm": 0.3085887134075165, "learning_rate": 9.843505477308296e-06, "loss": 13.7508, "step": 10 }, { "epoch": 0.05144694533762058, "grad_norm": 0.3942926228046417, "learning_rate": 9.827856025039125e-06, "loss": 17.3618, "step": 11 }, { "epoch": 0.056123940368313356, "grad_norm": 0.27711015939712524, "learning_rate": 9.812206572769954e-06, "loss": 16.2495, "step": 12 }, { "epoch": 0.06080093539900614, "grad_norm": 0.27834147214889526, "learning_rate": 9.796557120500783e-06, "loss": 15.9543, "step": 13 }, { "epoch": 0.06547793042969892, "grad_norm": 0.24677161872386932, "learning_rate": 9.780907668231613e-06, "loss": 15.6386, "step": 14 }, { "epoch": 0.0701549254603917, "grad_norm": 0.36406269669532776, "learning_rate": 9.765258215962442e-06, "loss": 15.6255, "step": 15 }, { "epoch": 0.07483192049108447, "grad_norm": 0.307948499917984, "learning_rate": 9.749608763693271e-06, "loss": 14.9189, "step": 16 }, { "epoch": 0.07950891552177726, "grad_norm": 0.2980886697769165, "learning_rate": 9.7339593114241e-06, "loss": 15.0406, "step": 17 }, { "epoch": 0.08418591055247004, "grad_norm": 0.412708044052124, "learning_rate": 9.71830985915493e-06, "loss": 14.7356, "step": 18 }, { "epoch": 0.08886290558316282, "grad_norm": 0.2903729975223541, "learning_rate": 9.70266040688576e-06, "loss": 14.8173, "step": 19 }, { "epoch": 0.0935399006138556, "grad_norm": 0.2171318084001541, "learning_rate": 9.687010954616589e-06, "loss": 15.0565, "step": 20 }, { "epoch": 0.09821689564454837, "grad_norm": 0.4166527986526489, "learning_rate": 9.671361502347418e-06, "loss": 14.1942, "step": 21 }, { "epoch": 0.10289389067524116, "grad_norm": 0.2564053237438202, "learning_rate": 9.655712050078247e-06, "loss": 14.3627, "step": 22 }, { "epoch": 0.10757088570593394, "grad_norm": 0.254341185092926, "learning_rate": 9.640062597809078e-06, "loss": 13.7315, "step": 23 }, { "epoch": 0.11224788073662671, "grad_norm": 0.24184982478618622, "learning_rate": 9.624413145539908e-06, "loss": 13.9785, "step": 24 }, { "epoch": 0.1169248757673195, "grad_norm": 0.3118051588535309, "learning_rate": 9.608763693270737e-06, "loss": 15.1744, "step": 25 }, { "epoch": 0.12160187079801228, "grad_norm": 0.2545301020145416, "learning_rate": 9.593114241001566e-06, "loss": 15.9676, "step": 26 }, { "epoch": 0.12627886582870507, "grad_norm": 0.2265356183052063, "learning_rate": 9.577464788732394e-06, "loss": 14.8985, "step": 27 }, { "epoch": 0.13095586085939784, "grad_norm": 0.2141331285238266, "learning_rate": 9.561815336463225e-06, "loss": 13.6498, "step": 28 }, { "epoch": 0.1356328558900906, "grad_norm": 0.27572301030158997, "learning_rate": 9.546165884194054e-06, "loss": 13.9124, "step": 29 }, { "epoch": 0.1403098509207834, "grad_norm": 0.1987282633781433, "learning_rate": 9.530516431924883e-06, "loss": 12.9095, "step": 30 }, { "epoch": 0.14498684595147618, "grad_norm": 0.2444925159215927, "learning_rate": 9.514866979655713e-06, "loss": 13.365, "step": 31 }, { "epoch": 0.14966384098216895, "grad_norm": 0.4400818645954132, "learning_rate": 9.499217527386542e-06, "loss": 13.2832, "step": 32 }, { "epoch": 0.15434083601286175, "grad_norm": 0.2764039933681488, "learning_rate": 9.483568075117371e-06, "loss": 14.3228, "step": 33 }, { "epoch": 0.15901783104355452, "grad_norm": 0.21101799607276917, "learning_rate": 9.4679186228482e-06, "loss": 14.156, "step": 34 }, { "epoch": 0.1636948260742473, "grad_norm": 0.267008513212204, "learning_rate": 9.45226917057903e-06, "loss": 14.1084, "step": 35 }, { "epoch": 0.1683718211049401, "grad_norm": 0.2759203016757965, "learning_rate": 9.43661971830986e-06, "loss": 13.5008, "step": 36 }, { "epoch": 0.17304881613563286, "grad_norm": 0.2793346643447876, "learning_rate": 9.42097026604069e-06, "loss": 14.2747, "step": 37 }, { "epoch": 0.17772581116632563, "grad_norm": 0.25120246410369873, "learning_rate": 9.40532081377152e-06, "loss": 13.4347, "step": 38 }, { "epoch": 0.1824028061970184, "grad_norm": 0.1591794341802597, "learning_rate": 9.389671361502349e-06, "loss": 12.619, "step": 39 }, { "epoch": 0.1870798012277112, "grad_norm": 0.2054363638162613, "learning_rate": 9.374021909233178e-06, "loss": 11.8876, "step": 40 }, { "epoch": 0.19175679625840397, "grad_norm": 0.23818843066692352, "learning_rate": 9.358372456964007e-06, "loss": 13.4683, "step": 41 }, { "epoch": 0.19643379128909674, "grad_norm": 0.32269319891929626, "learning_rate": 9.342723004694837e-06, "loss": 12.5351, "step": 42 }, { "epoch": 0.20111078631978954, "grad_norm": 0.29193466901779175, "learning_rate": 9.327073552425666e-06, "loss": 11.917, "step": 43 }, { "epoch": 0.2057877813504823, "grad_norm": 0.20844891667366028, "learning_rate": 9.311424100156495e-06, "loss": 12.0984, "step": 44 }, { "epoch": 0.21046477638117508, "grad_norm": 0.26920032501220703, "learning_rate": 9.295774647887325e-06, "loss": 14.1542, "step": 45 }, { "epoch": 0.21514177141186788, "grad_norm": 0.20874425768852234, "learning_rate": 9.280125195618154e-06, "loss": 13.9397, "step": 46 }, { "epoch": 0.21981876644256065, "grad_norm": 0.28703245520591736, "learning_rate": 9.264475743348983e-06, "loss": 12.7704, "step": 47 }, { "epoch": 0.22449576147325342, "grad_norm": 0.23402653634548187, "learning_rate": 9.248826291079813e-06, "loss": 12.8326, "step": 48 }, { "epoch": 0.22917275650394622, "grad_norm": 0.28065574169158936, "learning_rate": 9.233176838810642e-06, "loss": 11.4735, "step": 49 }, { "epoch": 0.233849751534639, "grad_norm": 0.21932877600193024, "learning_rate": 9.217527386541471e-06, "loss": 12.2491, "step": 50 }, { "epoch": 0.23852674656533177, "grad_norm": 0.24466539919376373, "learning_rate": 9.2018779342723e-06, "loss": 12.3501, "step": 51 }, { "epoch": 0.24320374159602456, "grad_norm": 0.17424331605434418, "learning_rate": 9.186228482003131e-06, "loss": 12.6445, "step": 52 }, { "epoch": 0.24788073662671734, "grad_norm": 0.2298133671283722, "learning_rate": 9.17057902973396e-06, "loss": 12.5759, "step": 53 }, { "epoch": 0.25255773165741013, "grad_norm": 0.30562305450439453, "learning_rate": 9.15492957746479e-06, "loss": 13.4988, "step": 54 }, { "epoch": 0.2572347266881029, "grad_norm": 0.21225547790527344, "learning_rate": 9.13928012519562e-06, "loss": 13.3909, "step": 55 }, { "epoch": 0.2619117217187957, "grad_norm": 0.3120986223220825, "learning_rate": 9.123630672926449e-06, "loss": 13.8276, "step": 56 }, { "epoch": 0.2665887167494885, "grad_norm": 0.18036110699176788, "learning_rate": 9.107981220657278e-06, "loss": 13.7724, "step": 57 }, { "epoch": 0.2712657117801812, "grad_norm": 0.22987115383148193, "learning_rate": 9.092331768388107e-06, "loss": 12.2669, "step": 58 }, { "epoch": 0.275942706810874, "grad_norm": 0.23878921568393707, "learning_rate": 9.076682316118937e-06, "loss": 12.7097, "step": 59 }, { "epoch": 0.2806197018415668, "grad_norm": 0.20319631695747375, "learning_rate": 9.061032863849766e-06, "loss": 12.2795, "step": 60 }, { "epoch": 0.28529669687225956, "grad_norm": 0.18609336018562317, "learning_rate": 9.045383411580595e-06, "loss": 11.2712, "step": 61 }, { "epoch": 0.28997369190295236, "grad_norm": 0.21320512890815735, "learning_rate": 9.029733959311425e-06, "loss": 11.6637, "step": 62 }, { "epoch": 0.29465068693364516, "grad_norm": 0.23330001533031464, "learning_rate": 9.014084507042254e-06, "loss": 12.9509, "step": 63 }, { "epoch": 0.2993276819643379, "grad_norm": 0.21313583850860596, "learning_rate": 8.998435054773083e-06, "loss": 13.8547, "step": 64 }, { "epoch": 0.3040046769950307, "grad_norm": 0.20739194750785828, "learning_rate": 8.982785602503912e-06, "loss": 12.735, "step": 65 }, { "epoch": 0.3086816720257235, "grad_norm": 0.2453576922416687, "learning_rate": 8.967136150234742e-06, "loss": 12.7951, "step": 66 }, { "epoch": 0.31335866705641624, "grad_norm": 0.21135878562927246, "learning_rate": 8.951486697965573e-06, "loss": 13.7611, "step": 67 }, { "epoch": 0.31803566208710904, "grad_norm": 0.2257193773984909, "learning_rate": 8.935837245696402e-06, "loss": 11.3833, "step": 68 }, { "epoch": 0.32271265711780184, "grad_norm": 0.1934535950422287, "learning_rate": 8.920187793427231e-06, "loss": 11.4428, "step": 69 }, { "epoch": 0.3273896521484946, "grad_norm": 0.19537678360939026, "learning_rate": 8.90453834115806e-06, "loss": 13.1129, "step": 70 }, { "epoch": 0.3320666471791874, "grad_norm": 0.2596362233161926, "learning_rate": 8.888888888888888e-06, "loss": 11.9323, "step": 71 }, { "epoch": 0.3367436422098802, "grad_norm": 0.28119221329689026, "learning_rate": 8.87323943661972e-06, "loss": 12.1397, "step": 72 }, { "epoch": 0.3414206372405729, "grad_norm": 0.2443932145833969, "learning_rate": 8.857589984350549e-06, "loss": 11.1756, "step": 73 }, { "epoch": 0.3460976322712657, "grad_norm": 0.23586861789226532, "learning_rate": 8.841940532081378e-06, "loss": 12.2808, "step": 74 }, { "epoch": 0.3507746273019585, "grad_norm": 0.2984711229801178, "learning_rate": 8.826291079812207e-06, "loss": 11.8437, "step": 75 }, { "epoch": 0.35545162233265126, "grad_norm": 0.2404984086751938, "learning_rate": 8.810641627543037e-06, "loss": 11.6321, "step": 76 }, { "epoch": 0.36012861736334406, "grad_norm": 0.22745920717716217, "learning_rate": 8.794992175273866e-06, "loss": 12.969, "step": 77 }, { "epoch": 0.3648056123940368, "grad_norm": 0.22989057004451752, "learning_rate": 8.779342723004695e-06, "loss": 12.1793, "step": 78 }, { "epoch": 0.3694826074247296, "grad_norm": 0.22097162902355194, "learning_rate": 8.763693270735524e-06, "loss": 12.5693, "step": 79 }, { "epoch": 0.3741596024554224, "grad_norm": 0.19985444843769073, "learning_rate": 8.748043818466354e-06, "loss": 13.3868, "step": 80 }, { "epoch": 0.37883659748611515, "grad_norm": 0.2339348942041397, "learning_rate": 8.732394366197183e-06, "loss": 11.5905, "step": 81 }, { "epoch": 0.38351359251680794, "grad_norm": 0.28241512179374695, "learning_rate": 8.716744913928014e-06, "loss": 12.6998, "step": 82 }, { "epoch": 0.38819058754750074, "grad_norm": 0.2848986089229584, "learning_rate": 8.701095461658843e-06, "loss": 11.3058, "step": 83 }, { "epoch": 0.3928675825781935, "grad_norm": 0.2118872106075287, "learning_rate": 8.685446009389673e-06, "loss": 10.4664, "step": 84 }, { "epoch": 0.3975445776088863, "grad_norm": 0.16718249022960663, "learning_rate": 8.669796557120502e-06, "loss": 13.2492, "step": 85 }, { "epoch": 0.4022215726395791, "grad_norm": 0.2131660282611847, "learning_rate": 8.65414710485133e-06, "loss": 12.0166, "step": 86 }, { "epoch": 0.4068985676702718, "grad_norm": 0.2012370079755783, "learning_rate": 8.63849765258216e-06, "loss": 12.5326, "step": 87 }, { "epoch": 0.4115755627009646, "grad_norm": 0.2684880793094635, "learning_rate": 8.62284820031299e-06, "loss": 12.8071, "step": 88 }, { "epoch": 0.4162525577316574, "grad_norm": 0.2500629127025604, "learning_rate": 8.60719874804382e-06, "loss": 12.1628, "step": 89 }, { "epoch": 0.42092955276235017, "grad_norm": 0.18125677108764648, "learning_rate": 8.591549295774648e-06, "loss": 11.3137, "step": 90 }, { "epoch": 0.42560654779304297, "grad_norm": 0.1830630898475647, "learning_rate": 8.575899843505478e-06, "loss": 11.8507, "step": 91 }, { "epoch": 0.43028354282373577, "grad_norm": 0.1481466144323349, "learning_rate": 8.560250391236307e-06, "loss": 10.8795, "step": 92 }, { "epoch": 0.4349605378544285, "grad_norm": 0.18768347799777985, "learning_rate": 8.544600938967136e-06, "loss": 11.2509, "step": 93 }, { "epoch": 0.4396375328851213, "grad_norm": 0.22724182903766632, "learning_rate": 8.528951486697966e-06, "loss": 11.6564, "step": 94 }, { "epoch": 0.4443145279158141, "grad_norm": 0.1806531399488449, "learning_rate": 8.513302034428795e-06, "loss": 11.9111, "step": 95 }, { "epoch": 0.44899152294650685, "grad_norm": 0.2578674554824829, "learning_rate": 8.497652582159626e-06, "loss": 13.1609, "step": 96 }, { "epoch": 0.45366851797719965, "grad_norm": 0.21666157245635986, "learning_rate": 8.482003129890455e-06, "loss": 12.3285, "step": 97 }, { "epoch": 0.45834551300789245, "grad_norm": 0.2574619948863983, "learning_rate": 8.466353677621285e-06, "loss": 11.4998, "step": 98 }, { "epoch": 0.4630225080385852, "grad_norm": 0.28588882088661194, "learning_rate": 8.450704225352114e-06, "loss": 11.0233, "step": 99 }, { "epoch": 0.467699503069278, "grad_norm": 0.28356659412384033, "learning_rate": 8.435054773082943e-06, "loss": 10.9355, "step": 100 }, { "epoch": 0.4723764980999708, "grad_norm": 0.18748782575130463, "learning_rate": 8.419405320813773e-06, "loss": 13.5926, "step": 101 }, { "epoch": 0.47705349313066353, "grad_norm": 0.17172126471996307, "learning_rate": 8.403755868544602e-06, "loss": 11.4017, "step": 102 }, { "epoch": 0.48173048816135633, "grad_norm": 0.1956973671913147, "learning_rate": 8.388106416275431e-06, "loss": 12.1463, "step": 103 }, { "epoch": 0.48640748319204913, "grad_norm": 0.30823975801467896, "learning_rate": 8.37245696400626e-06, "loss": 10.2949, "step": 104 }, { "epoch": 0.49108447822274187, "grad_norm": 0.23158958554267883, "learning_rate": 8.35680751173709e-06, "loss": 11.2003, "step": 105 }, { "epoch": 0.49576147325343467, "grad_norm": 0.23977261781692505, "learning_rate": 8.341158059467919e-06, "loss": 11.5904, "step": 106 }, { "epoch": 0.5004384682841274, "grad_norm": 0.17250728607177734, "learning_rate": 8.325508607198748e-06, "loss": 11.2648, "step": 107 }, { "epoch": 0.5051154633148203, "grad_norm": 0.23300261795520782, "learning_rate": 8.309859154929578e-06, "loss": 11.9646, "step": 108 }, { "epoch": 0.509792458345513, "grad_norm": 0.2430488020181656, "learning_rate": 8.294209702660407e-06, "loss": 12.046, "step": 109 }, { "epoch": 0.5144694533762058, "grad_norm": 0.18206799030303955, "learning_rate": 8.278560250391236e-06, "loss": 12.0767, "step": 110 }, { "epoch": 0.5191464484068986, "grad_norm": 0.25876322388648987, "learning_rate": 8.262910798122067e-06, "loss": 11.7794, "step": 111 }, { "epoch": 0.5238234434375914, "grad_norm": 0.28936639428138733, "learning_rate": 8.247261345852897e-06, "loss": 10.3819, "step": 112 }, { "epoch": 0.5285004384682841, "grad_norm": 0.214036762714386, "learning_rate": 8.231611893583726e-06, "loss": 10.3209, "step": 113 }, { "epoch": 0.533177433498977, "grad_norm": 0.23764470219612122, "learning_rate": 8.215962441314555e-06, "loss": 10.8417, "step": 114 }, { "epoch": 0.5378544285296697, "grad_norm": 0.2604602575302124, "learning_rate": 8.200312989045383e-06, "loss": 12.534, "step": 115 }, { "epoch": 0.5425314235603624, "grad_norm": 0.24597330391407013, "learning_rate": 8.184663536776214e-06, "loss": 12.348, "step": 116 }, { "epoch": 0.5472084185910553, "grad_norm": 0.2204928994178772, "learning_rate": 8.169014084507043e-06, "loss": 10.979, "step": 117 }, { "epoch": 0.551885413621748, "grad_norm": 0.15487593412399292, "learning_rate": 8.153364632237872e-06, "loss": 11.0756, "step": 118 }, { "epoch": 0.5565624086524408, "grad_norm": 0.23864871263504028, "learning_rate": 8.137715179968702e-06, "loss": 11.66, "step": 119 }, { "epoch": 0.5612394036831336, "grad_norm": 0.22024200856685638, "learning_rate": 8.122065727699531e-06, "loss": 10.7713, "step": 120 }, { "epoch": 0.5659163987138264, "grad_norm": 0.19292014837265015, "learning_rate": 8.10641627543036e-06, "loss": 9.4704, "step": 121 }, { "epoch": 0.5705933937445191, "grad_norm": 0.16765080392360687, "learning_rate": 8.09076682316119e-06, "loss": 10.7993, "step": 122 }, { "epoch": 0.575270388775212, "grad_norm": 0.26758840680122375, "learning_rate": 8.075117370892019e-06, "loss": 11.354, "step": 123 }, { "epoch": 0.5799473838059047, "grad_norm": 0.25225985050201416, "learning_rate": 8.059467918622848e-06, "loss": 11.2162, "step": 124 }, { "epoch": 0.5846243788365975, "grad_norm": 0.22062422335147858, "learning_rate": 8.043818466353678e-06, "loss": 9.9452, "step": 125 }, { "epoch": 0.5893013738672903, "grad_norm": 0.2589726746082306, "learning_rate": 8.028169014084509e-06, "loss": 11.6098, "step": 126 }, { "epoch": 0.5939783688979831, "grad_norm": 0.23492346704006195, "learning_rate": 8.012519561815338e-06, "loss": 10.6918, "step": 127 }, { "epoch": 0.5986553639286758, "grad_norm": 0.29631978273391724, "learning_rate": 7.996870109546167e-06, "loss": 11.4451, "step": 128 }, { "epoch": 0.6033323589593687, "grad_norm": 0.195633202791214, "learning_rate": 7.981220657276996e-06, "loss": 11.3396, "step": 129 }, { "epoch": 0.6080093539900614, "grad_norm": 0.14094115793704987, "learning_rate": 7.965571205007824e-06, "loss": 10.9388, "step": 130 }, { "epoch": 0.6126863490207541, "grad_norm": 0.2307533323764801, "learning_rate": 7.949921752738655e-06, "loss": 12.2129, "step": 131 }, { "epoch": 0.617363344051447, "grad_norm": 0.2004641741514206, "learning_rate": 7.934272300469484e-06, "loss": 9.9139, "step": 132 }, { "epoch": 0.6220403390821397, "grad_norm": 0.22784000635147095, "learning_rate": 7.918622848200314e-06, "loss": 10.2306, "step": 133 }, { "epoch": 0.6267173341128325, "grad_norm": 0.21663011610507965, "learning_rate": 7.902973395931143e-06, "loss": 9.9467, "step": 134 }, { "epoch": 0.6313943291435253, "grad_norm": 0.18714800477027893, "learning_rate": 7.887323943661972e-06, "loss": 9.7232, "step": 135 }, { "epoch": 0.6360713241742181, "grad_norm": 0.23525570333003998, "learning_rate": 7.871674491392802e-06, "loss": 9.9539, "step": 136 }, { "epoch": 0.6407483192049108, "grad_norm": 0.22870206832885742, "learning_rate": 7.856025039123631e-06, "loss": 11.9964, "step": 137 }, { "epoch": 0.6454253142356037, "grad_norm": 0.19730104506015778, "learning_rate": 7.84037558685446e-06, "loss": 10.8391, "step": 138 }, { "epoch": 0.6501023092662964, "grad_norm": 0.1873929351568222, "learning_rate": 7.82472613458529e-06, "loss": 10.7179, "step": 139 }, { "epoch": 0.6547793042969892, "grad_norm": 0.14801403880119324, "learning_rate": 7.809076682316119e-06, "loss": 10.9041, "step": 140 }, { "epoch": 0.659456299327682, "grad_norm": 0.21909023821353912, "learning_rate": 7.79342723004695e-06, "loss": 11.5497, "step": 141 }, { "epoch": 0.6641332943583748, "grad_norm": 0.20469622313976288, "learning_rate": 7.77777777777778e-06, "loss": 11.0387, "step": 142 }, { "epoch": 0.6688102893890675, "grad_norm": 0.20616918802261353, "learning_rate": 7.762128325508608e-06, "loss": 9.5392, "step": 143 }, { "epoch": 0.6734872844197604, "grad_norm": 0.1846546232700348, "learning_rate": 7.746478873239436e-06, "loss": 11.5538, "step": 144 }, { "epoch": 0.6781642794504531, "grad_norm": 0.17778314650058746, "learning_rate": 7.730829420970265e-06, "loss": 12.8435, "step": 145 }, { "epoch": 0.6828412744811458, "grad_norm": 0.24238605797290802, "learning_rate": 7.715179968701096e-06, "loss": 9.4674, "step": 146 }, { "epoch": 0.6875182695118387, "grad_norm": 0.20961545407772064, "learning_rate": 7.699530516431926e-06, "loss": 10.1325, "step": 147 }, { "epoch": 0.6921952645425314, "grad_norm": 0.20476683974266052, "learning_rate": 7.683881064162755e-06, "loss": 11.1375, "step": 148 }, { "epoch": 0.6968722595732242, "grad_norm": 0.22241833806037903, "learning_rate": 7.668231611893584e-06, "loss": 9.6296, "step": 149 }, { "epoch": 0.701549254603917, "grad_norm": 0.2302970439195633, "learning_rate": 7.652582159624414e-06, "loss": 10.8763, "step": 150 }, { "epoch": 0.7062262496346098, "grad_norm": 0.20484097301959991, "learning_rate": 7.636932707355243e-06, "loss": 9.0306, "step": 151 }, { "epoch": 0.7109032446653025, "grad_norm": 0.20411114394664764, "learning_rate": 7.621283255086073e-06, "loss": 11.5865, "step": 152 }, { "epoch": 0.7155802396959953, "grad_norm": 0.37148869037628174, "learning_rate": 7.6056338028169015e-06, "loss": 10.4929, "step": 153 }, { "epoch": 0.7202572347266881, "grad_norm": 0.19864030182361603, "learning_rate": 7.589984350547731e-06, "loss": 10.4561, "step": 154 }, { "epoch": 0.7249342297573809, "grad_norm": 0.21187515556812286, "learning_rate": 7.574334898278561e-06, "loss": 9.6848, "step": 155 }, { "epoch": 0.7296112247880736, "grad_norm": 0.18564990162849426, "learning_rate": 7.55868544600939e-06, "loss": 11.2932, "step": 156 }, { "epoch": 0.7342882198187665, "grad_norm": 0.21274517476558685, "learning_rate": 7.54303599374022e-06, "loss": 10.2206, "step": 157 }, { "epoch": 0.7389652148494592, "grad_norm": 0.23622578382492065, "learning_rate": 7.527386541471049e-06, "loss": 9.4342, "step": 158 }, { "epoch": 0.743642209880152, "grad_norm": 0.21262332797050476, "learning_rate": 7.511737089201878e-06, "loss": 11.4181, "step": 159 }, { "epoch": 0.7483192049108448, "grad_norm": 0.22142890095710754, "learning_rate": 7.496087636932708e-06, "loss": 10.4912, "step": 160 }, { "epoch": 0.7529961999415375, "grad_norm": 0.219626322388649, "learning_rate": 7.480438184663538e-06, "loss": 10.902, "step": 161 }, { "epoch": 0.7576731949722303, "grad_norm": 0.19913645088672638, "learning_rate": 7.464788732394367e-06, "loss": 8.9078, "step": 162 }, { "epoch": 0.7623501900029231, "grad_norm": 0.19409991800785065, "learning_rate": 7.449139280125196e-06, "loss": 10.7111, "step": 163 }, { "epoch": 0.7670271850336159, "grad_norm": 0.20056220889091492, "learning_rate": 7.433489827856026e-06, "loss": 11.438, "step": 164 }, { "epoch": 0.7717041800643086, "grad_norm": 0.19502754509449005, "learning_rate": 7.417840375586856e-06, "loss": 10.1837, "step": 165 }, { "epoch": 0.7763811750950015, "grad_norm": 0.17272567749023438, "learning_rate": 7.402190923317685e-06, "loss": 10.7406, "step": 166 }, { "epoch": 0.7810581701256942, "grad_norm": 0.19558610022068024, "learning_rate": 7.386541471048514e-06, "loss": 10.1322, "step": 167 }, { "epoch": 0.785735165156387, "grad_norm": 0.2161480039358139, "learning_rate": 7.370892018779343e-06, "loss": 9.7506, "step": 168 }, { "epoch": 0.7904121601870798, "grad_norm": 0.25595343112945557, "learning_rate": 7.355242566510172e-06, "loss": 11.0059, "step": 169 }, { "epoch": 0.7950891552177726, "grad_norm": 0.21218866109848022, "learning_rate": 7.339593114241002e-06, "loss": 11.2122, "step": 170 }, { "epoch": 0.7997661502484653, "grad_norm": 0.1922176331281662, "learning_rate": 7.3239436619718316e-06, "loss": 11.0585, "step": 171 }, { "epoch": 0.8044431452791582, "grad_norm": 0.1726471334695816, "learning_rate": 7.308294209702661e-06, "loss": 11.3007, "step": 172 }, { "epoch": 0.8091201403098509, "grad_norm": 0.20865805447101593, "learning_rate": 7.29264475743349e-06, "loss": 12.5848, "step": 173 }, { "epoch": 0.8137971353405437, "grad_norm": 0.2097303569316864, "learning_rate": 7.2769953051643195e-06, "loss": 11.694, "step": 174 }, { "epoch": 0.8184741303712365, "grad_norm": 0.22343699634075165, "learning_rate": 7.26134585289515e-06, "loss": 9.9861, "step": 175 }, { "epoch": 0.8231511254019293, "grad_norm": 0.19908592104911804, "learning_rate": 7.245696400625979e-06, "loss": 10.7263, "step": 176 }, { "epoch": 0.827828120432622, "grad_norm": 0.2062506079673767, "learning_rate": 7.230046948356808e-06, "loss": 10.7234, "step": 177 }, { "epoch": 0.8325051154633148, "grad_norm": 0.23186688125133514, "learning_rate": 7.2143974960876376e-06, "loss": 10.7846, "step": 178 }, { "epoch": 0.8371821104940076, "grad_norm": 0.20528610050678253, "learning_rate": 7.198748043818467e-06, "loss": 10.6732, "step": 179 }, { "epoch": 0.8418591055247003, "grad_norm": 0.21028846502304077, "learning_rate": 7.183098591549297e-06, "loss": 9.5007, "step": 180 }, { "epoch": 0.8465361005553932, "grad_norm": 0.1943686306476593, "learning_rate": 7.167449139280126e-06, "loss": 10.6163, "step": 181 }, { "epoch": 0.8512130955860859, "grad_norm": 0.15791501104831696, "learning_rate": 7.151799687010955e-06, "loss": 10.4564, "step": 182 }, { "epoch": 0.8558900906167787, "grad_norm": 0.15603427588939667, "learning_rate": 7.136150234741784e-06, "loss": 11.4006, "step": 183 }, { "epoch": 0.8605670856474715, "grad_norm": 0.1737872064113617, "learning_rate": 7.120500782472613e-06, "loss": 10.2583, "step": 184 }, { "epoch": 0.8652440806781643, "grad_norm": 0.16742144525051117, "learning_rate": 7.1048513302034435e-06, "loss": 9.6543, "step": 185 }, { "epoch": 0.869921075708857, "grad_norm": 0.2204071581363678, "learning_rate": 7.089201877934273e-06, "loss": 10.6068, "step": 186 }, { "epoch": 0.8745980707395499, "grad_norm": 0.17526549100875854, "learning_rate": 7.073552425665102e-06, "loss": 10.5927, "step": 187 }, { "epoch": 0.8792750657702426, "grad_norm": 0.18857762217521667, "learning_rate": 7.0579029733959315e-06, "loss": 10.0686, "step": 188 }, { "epoch": 0.8839520608009354, "grad_norm": 0.16617538034915924, "learning_rate": 7.042253521126761e-06, "loss": 11.0356, "step": 189 }, { "epoch": 0.8886290558316282, "grad_norm": 0.20443867146968842, "learning_rate": 7.026604068857591e-06, "loss": 9.764, "step": 190 }, { "epoch": 0.893306050862321, "grad_norm": 0.16466206312179565, "learning_rate": 7.01095461658842e-06, "loss": 8.9783, "step": 191 }, { "epoch": 0.8979830458930137, "grad_norm": 0.2051703780889511, "learning_rate": 6.9953051643192495e-06, "loss": 10.5345, "step": 192 }, { "epoch": 0.9026600409237066, "grad_norm": 0.19935429096221924, "learning_rate": 6.979655712050079e-06, "loss": 10.1047, "step": 193 }, { "epoch": 0.9073370359543993, "grad_norm": 0.14471961557865143, "learning_rate": 6.964006259780907e-06, "loss": 8.9315, "step": 194 }, { "epoch": 0.912014030985092, "grad_norm": 0.21026520431041718, "learning_rate": 6.948356807511738e-06, "loss": 11.0192, "step": 195 }, { "epoch": 0.9166910260157849, "grad_norm": 0.22124925255775452, "learning_rate": 6.932707355242568e-06, "loss": 10.7211, "step": 196 }, { "epoch": 0.9213680210464776, "grad_norm": 0.6166573166847229, "learning_rate": 6.917057902973396e-06, "loss": 10.1654, "step": 197 }, { "epoch": 0.9260450160771704, "grad_norm": 0.14892670512199402, "learning_rate": 6.901408450704225e-06, "loss": 9.6949, "step": 198 }, { "epoch": 0.9307220111078632, "grad_norm": 0.17058013379573822, "learning_rate": 6.885758998435055e-06, "loss": 9.9864, "step": 199 }, { "epoch": 0.935399006138556, "grad_norm": 0.19176752865314484, "learning_rate": 6.870109546165885e-06, "loss": 9.7219, "step": 200 }, { "epoch": 0.9400760011692487, "grad_norm": 0.1923060268163681, "learning_rate": 6.854460093896714e-06, "loss": 9.0111, "step": 201 }, { "epoch": 0.9447529961999416, "grad_norm": 0.22771762311458588, "learning_rate": 6.8388106416275434e-06, "loss": 9.9277, "step": 202 }, { "epoch": 0.9494299912306343, "grad_norm": 0.21972382068634033, "learning_rate": 6.823161189358373e-06, "loss": 10.5451, "step": 203 }, { "epoch": 0.9541069862613271, "grad_norm": 0.32944294810295105, "learning_rate": 6.807511737089203e-06, "loss": 9.8053, "step": 204 }, { "epoch": 0.9587839812920199, "grad_norm": 0.1875985562801361, "learning_rate": 6.791862284820032e-06, "loss": 10.3256, "step": 205 }, { "epoch": 0.9634609763227127, "grad_norm": 0.17583012580871582, "learning_rate": 6.7762128325508615e-06, "loss": 10.4922, "step": 206 }, { "epoch": 0.9681379713534054, "grad_norm": 0.22149552404880524, "learning_rate": 6.760563380281691e-06, "loss": 10.1547, "step": 207 }, { "epoch": 0.9728149663840983, "grad_norm": 0.18506276607513428, "learning_rate": 6.74491392801252e-06, "loss": 10.5188, "step": 208 }, { "epoch": 0.977491961414791, "grad_norm": 0.21199573576450348, "learning_rate": 6.72926447574335e-06, "loss": 11.3258, "step": 209 }, { "epoch": 0.9821689564454837, "grad_norm": 0.18747669458389282, "learning_rate": 6.71361502347418e-06, "loss": 10.251, "step": 210 }, { "epoch": 0.9868459514761766, "grad_norm": 0.1887262761592865, "learning_rate": 6.697965571205008e-06, "loss": 9.2012, "step": 211 }, { "epoch": 0.9915229465068693, "grad_norm": 0.16557927429676056, "learning_rate": 6.682316118935837e-06, "loss": 9.2171, "step": 212 }, { "epoch": 0.9961999415375621, "grad_norm": 0.19340123236179352, "learning_rate": 6.666666666666667e-06, "loss": 9.4988, "step": 213 }, { "epoch": 1.0046769950306929, "grad_norm": 0.3001099228858948, "learning_rate": 6.651017214397497e-06, "loss": 11.8577, "step": 214 }, { "epoch": 1.0093539900613855, "grad_norm": 0.18085287511348724, "learning_rate": 6.635367762128326e-06, "loss": 10.4356, "step": 215 }, { "epoch": 1.0140309850920783, "grad_norm": 0.17791183292865753, "learning_rate": 6.619718309859155e-06, "loss": 10.3929, "step": 216 }, { "epoch": 1.0187079801227712, "grad_norm": 0.20649202167987823, "learning_rate": 6.604068857589985e-06, "loss": 9.342, "step": 217 }, { "epoch": 1.0233849751534638, "grad_norm": 0.2049955129623413, "learning_rate": 6.588419405320814e-06, "loss": 10.656, "step": 218 }, { "epoch": 1.0280619701841567, "grad_norm": 0.18064165115356445, "learning_rate": 6.572769953051644e-06, "loss": 10.1633, "step": 219 }, { "epoch": 1.0327389652148495, "grad_norm": 0.1652020812034607, "learning_rate": 6.5571205007824735e-06, "loss": 8.9937, "step": 220 }, { "epoch": 1.0374159602455422, "grad_norm": 0.16658996045589447, "learning_rate": 6.541471048513303e-06, "loss": 11.0051, "step": 221 }, { "epoch": 1.042092955276235, "grad_norm": 0.1875378042459488, "learning_rate": 6.525821596244132e-06, "loss": 9.7089, "step": 222 }, { "epoch": 1.0467699503069279, "grad_norm": 0.19267050921916962, "learning_rate": 6.510172143974961e-06, "loss": 10.0252, "step": 223 }, { "epoch": 1.0514469453376205, "grad_norm": 0.2656681537628174, "learning_rate": 6.4945226917057916e-06, "loss": 9.7082, "step": 224 }, { "epoch": 1.0561239403683134, "grad_norm": 0.16058804094791412, "learning_rate": 6.478873239436621e-06, "loss": 9.6689, "step": 225 }, { "epoch": 1.0608009353990062, "grad_norm": 0.14145280420780182, "learning_rate": 6.463223787167449e-06, "loss": 8.6923, "step": 226 }, { "epoch": 1.0654779304296988, "grad_norm": 0.14217382669448853, "learning_rate": 6.447574334898279e-06, "loss": 10.4302, "step": 227 }, { "epoch": 1.0701549254603917, "grad_norm": 0.18387371301651, "learning_rate": 6.431924882629108e-06, "loss": 9.5514, "step": 228 }, { "epoch": 1.0748319204910846, "grad_norm": 0.15731996297836304, "learning_rate": 6.416275430359938e-06, "loss": 9.2854, "step": 229 }, { "epoch": 1.0795089155217772, "grad_norm": 0.1794990450143814, "learning_rate": 6.400625978090767e-06, "loss": 11.0837, "step": 230 }, { "epoch": 1.08418591055247, "grad_norm": 0.19289837777614594, "learning_rate": 6.384976525821597e-06, "loss": 9.3129, "step": 231 }, { "epoch": 1.088862905583163, "grad_norm": 0.1858958899974823, "learning_rate": 6.369327073552426e-06, "loss": 10.7238, "step": 232 }, { "epoch": 1.0935399006138555, "grad_norm": 0.26388686895370483, "learning_rate": 6.353677621283255e-06, "loss": 9.2242, "step": 233 }, { "epoch": 1.0982168956445484, "grad_norm": 0.17551296949386597, "learning_rate": 6.3380281690140855e-06, "loss": 8.3665, "step": 234 }, { "epoch": 1.1028938906752412, "grad_norm": 0.20290863513946533, "learning_rate": 6.322378716744915e-06, "loss": 9.6916, "step": 235 }, { "epoch": 1.1075708857059339, "grad_norm": 0.11323179304599762, "learning_rate": 6.306729264475744e-06, "loss": 10.218, "step": 236 }, { "epoch": 1.1122478807366267, "grad_norm": 0.22893109917640686, "learning_rate": 6.291079812206573e-06, "loss": 10.3068, "step": 237 }, { "epoch": 1.1169248757673196, "grad_norm": 0.1943362057209015, "learning_rate": 6.275430359937402e-06, "loss": 9.738, "step": 238 }, { "epoch": 1.1216018707980122, "grad_norm": 0.22017931938171387, "learning_rate": 6.259780907668233e-06, "loss": 8.5765, "step": 239 }, { "epoch": 1.126278865828705, "grad_norm": 0.1584814190864563, "learning_rate": 6.244131455399062e-06, "loss": 11.0436, "step": 240 }, { "epoch": 1.130955860859398, "grad_norm": 0.182816743850708, "learning_rate": 6.228482003129891e-06, "loss": 11.1518, "step": 241 }, { "epoch": 1.1356328558900906, "grad_norm": 0.21375828981399536, "learning_rate": 6.21283255086072e-06, "loss": 10.0972, "step": 242 }, { "epoch": 1.1403098509207834, "grad_norm": 0.1926356703042984, "learning_rate": 6.197183098591549e-06, "loss": 9.0861, "step": 243 }, { "epoch": 1.1449868459514763, "grad_norm": 0.13788476586341858, "learning_rate": 6.181533646322379e-06, "loss": 9.1896, "step": 244 }, { "epoch": 1.149663840982169, "grad_norm": 0.24886344373226166, "learning_rate": 6.165884194053209e-06, "loss": 8.9126, "step": 245 }, { "epoch": 1.1543408360128617, "grad_norm": 0.21492387354373932, "learning_rate": 6.150234741784038e-06, "loss": 9.1809, "step": 246 }, { "epoch": 1.1590178310435546, "grad_norm": 0.20666466653347015, "learning_rate": 6.134585289514867e-06, "loss": 9.8609, "step": 247 }, { "epoch": 1.1636948260742472, "grad_norm": 0.12884530425071716, "learning_rate": 6.118935837245697e-06, "loss": 9.0015, "step": 248 }, { "epoch": 1.16837182110494, "grad_norm": 0.2109869420528412, "learning_rate": 6.103286384976527e-06, "loss": 8.4398, "step": 249 }, { "epoch": 1.173048816135633, "grad_norm": 0.1602170467376709, "learning_rate": 6.087636932707356e-06, "loss": 8.9123, "step": 250 }, { "epoch": 1.1777258111663256, "grad_norm": 0.1901443898677826, "learning_rate": 6.071987480438185e-06, "loss": 9.3279, "step": 251 }, { "epoch": 1.1824028061970184, "grad_norm": 0.12106055021286011, "learning_rate": 6.056338028169015e-06, "loss": 8.8215, "step": 252 }, { "epoch": 1.1870798012277113, "grad_norm": 0.15600277483463287, "learning_rate": 6.040688575899843e-06, "loss": 9.5461, "step": 253 }, { "epoch": 1.191756796258404, "grad_norm": 0.211564302444458, "learning_rate": 6.025039123630674e-06, "loss": 9.9196, "step": 254 }, { "epoch": 1.1964337912890968, "grad_norm": 0.16480544209480286, "learning_rate": 6.0093896713615026e-06, "loss": 9.488, "step": 255 }, { "epoch": 1.2011107863197896, "grad_norm": 0.22194457054138184, "learning_rate": 5.993740219092332e-06, "loss": 10.415, "step": 256 }, { "epoch": 1.2057877813504823, "grad_norm": 0.27972927689552307, "learning_rate": 5.978090766823161e-06, "loss": 9.3022, "step": 257 }, { "epoch": 1.2104647763811751, "grad_norm": 0.23484700918197632, "learning_rate": 5.9624413145539905e-06, "loss": 8.218, "step": 258 }, { "epoch": 1.215141771411868, "grad_norm": 0.20119240880012512, "learning_rate": 5.946791862284821e-06, "loss": 8.204, "step": 259 }, { "epoch": 1.2198187664425606, "grad_norm": 0.19867953658103943, "learning_rate": 5.93114241001565e-06, "loss": 9.4491, "step": 260 }, { "epoch": 1.2244957614732535, "grad_norm": 0.19878610968589783, "learning_rate": 5.915492957746479e-06, "loss": 10.683, "step": 261 }, { "epoch": 1.2291727565039463, "grad_norm": 0.18710929155349731, "learning_rate": 5.8998435054773086e-06, "loss": 10.2426, "step": 262 }, { "epoch": 1.233849751534639, "grad_norm": 0.1873483806848526, "learning_rate": 5.884194053208139e-06, "loss": 10.1553, "step": 263 }, { "epoch": 1.2385267465653318, "grad_norm": 0.26153287291526794, "learning_rate": 5.868544600938968e-06, "loss": 9.8046, "step": 264 }, { "epoch": 1.2432037415960246, "grad_norm": 0.17956022918224335, "learning_rate": 5.852895148669797e-06, "loss": 9.2137, "step": 265 }, { "epoch": 1.2478807366267173, "grad_norm": 0.15572352707386017, "learning_rate": 5.837245696400627e-06, "loss": 9.2382, "step": 266 }, { "epoch": 1.2525577316574101, "grad_norm": 0.16768573224544525, "learning_rate": 5.821596244131456e-06, "loss": 10.1462, "step": 267 }, { "epoch": 1.257234726688103, "grad_norm": 0.14606249332427979, "learning_rate": 5.805946791862286e-06, "loss": 9.6735, "step": 268 }, { "epoch": 1.2619117217187956, "grad_norm": 0.20985975861549377, "learning_rate": 5.790297339593115e-06, "loss": 10.9061, "step": 269 }, { "epoch": 1.2665887167494885, "grad_norm": 0.17635460197925568, "learning_rate": 5.774647887323944e-06, "loss": 9.1385, "step": 270 }, { "epoch": 1.271265711780181, "grad_norm": 0.19080878794193268, "learning_rate": 5.758998435054773e-06, "loss": 9.8189, "step": 271 }, { "epoch": 1.275942706810874, "grad_norm": 0.1511276364326477, "learning_rate": 5.7433489827856025e-06, "loss": 9.9191, "step": 272 }, { "epoch": 1.2806197018415668, "grad_norm": 0.2525511085987091, "learning_rate": 5.727699530516433e-06, "loss": 8.7398, "step": 273 }, { "epoch": 1.2852966968722597, "grad_norm": 0.18259669840335846, "learning_rate": 5.712050078247262e-06, "loss": 10.7875, "step": 274 }, { "epoch": 1.2899736919029523, "grad_norm": 0.2251911461353302, "learning_rate": 5.696400625978091e-06, "loss": 8.9997, "step": 275 }, { "epoch": 1.2946506869336452, "grad_norm": 0.17306119203567505, "learning_rate": 5.6807511737089205e-06, "loss": 10.0071, "step": 276 }, { "epoch": 1.2993276819643378, "grad_norm": 0.23585619032382965, "learning_rate": 5.66510172143975e-06, "loss": 9.5575, "step": 277 }, { "epoch": 1.3040046769950306, "grad_norm": 0.2100452035665512, "learning_rate": 5.64945226917058e-06, "loss": 9.6862, "step": 278 }, { "epoch": 1.3086816720257235, "grad_norm": 0.19781209528446198, "learning_rate": 5.633802816901409e-06, "loss": 9.6712, "step": 279 }, { "epoch": 1.3133586670564164, "grad_norm": 0.20990189909934998, "learning_rate": 5.618153364632239e-06, "loss": 9.1145, "step": 280 }, { "epoch": 1.318035662087109, "grad_norm": 0.14471188187599182, "learning_rate": 5.602503912363068e-06, "loss": 10.0124, "step": 281 }, { "epoch": 1.3227126571178018, "grad_norm": 0.181657612323761, "learning_rate": 5.586854460093896e-06, "loss": 8.5702, "step": 282 }, { "epoch": 1.3273896521484945, "grad_norm": 0.28895941376686096, "learning_rate": 5.571205007824727e-06, "loss": 8.7288, "step": 283 }, { "epoch": 1.3320666471791873, "grad_norm": 0.19658011198043823, "learning_rate": 5.555555555555557e-06, "loss": 10.2721, "step": 284 }, { "epoch": 1.3367436422098802, "grad_norm": 0.1778428554534912, "learning_rate": 5.539906103286385e-06, "loss": 8.6042, "step": 285 }, { "epoch": 1.341420637240573, "grad_norm": 0.1622474491596222, "learning_rate": 5.5242566510172144e-06, "loss": 9.0871, "step": 286 }, { "epoch": 1.3460976322712657, "grad_norm": 0.17768928408622742, "learning_rate": 5.508607198748044e-06, "loss": 9.1438, "step": 287 }, { "epoch": 1.3507746273019585, "grad_norm": 0.15472590923309326, "learning_rate": 5.492957746478874e-06, "loss": 8.1626, "step": 288 }, { "epoch": 1.3554516223326512, "grad_norm": 0.151944100856781, "learning_rate": 5.477308294209703e-06, "loss": 10.6628, "step": 289 }, { "epoch": 1.360128617363344, "grad_norm": 0.2412179410457611, "learning_rate": 5.4616588419405325e-06, "loss": 10.0811, "step": 290 }, { "epoch": 1.3648056123940369, "grad_norm": 0.1254899650812149, "learning_rate": 5.446009389671362e-06, "loss": 8.7967, "step": 291 }, { "epoch": 1.3694826074247297, "grad_norm": 0.1940433233976364, "learning_rate": 5.430359937402191e-06, "loss": 10.7896, "step": 292 }, { "epoch": 1.3741596024554223, "grad_norm": 0.23099660873413086, "learning_rate": 5.414710485133021e-06, "loss": 10.3398, "step": 293 }, { "epoch": 1.3788365974861152, "grad_norm": 0.14648781716823578, "learning_rate": 5.3990610328638506e-06, "loss": 9.3573, "step": 294 }, { "epoch": 1.3835135925168078, "grad_norm": 0.18853303790092468, "learning_rate": 5.38341158059468e-06, "loss": 9.8656, "step": 295 }, { "epoch": 1.3881905875475007, "grad_norm": 0.20366129279136658, "learning_rate": 5.367762128325509e-06, "loss": 10.2061, "step": 296 }, { "epoch": 1.3928675825781935, "grad_norm": 0.18720601499080658, "learning_rate": 5.352112676056338e-06, "loss": 8.4737, "step": 297 }, { "epoch": 1.3975445776088864, "grad_norm": 0.1396239697933197, "learning_rate": 5.336463223787169e-06, "loss": 9.3009, "step": 298 }, { "epoch": 1.402221572639579, "grad_norm": 0.19741852581501007, "learning_rate": 5.320813771517997e-06, "loss": 9.7318, "step": 299 }, { "epoch": 1.4068985676702719, "grad_norm": 0.1550920307636261, "learning_rate": 5.305164319248826e-06, "loss": 9.0948, "step": 300 }, { "epoch": 1.4115755627009645, "grad_norm": 0.20845593512058258, "learning_rate": 5.289514866979656e-06, "loss": 8.555, "step": 301 }, { "epoch": 1.4162525577316574, "grad_norm": 0.15616929531097412, "learning_rate": 5.273865414710485e-06, "loss": 9.293, "step": 302 }, { "epoch": 1.4209295527623502, "grad_norm": 0.18581336736679077, "learning_rate": 5.258215962441315e-06, "loss": 8.6798, "step": 303 }, { "epoch": 1.425606547793043, "grad_norm": 0.14762163162231445, "learning_rate": 5.2425665101721445e-06, "loss": 7.7574, "step": 304 }, { "epoch": 1.4302835428237357, "grad_norm": 0.11617639660835266, "learning_rate": 5.226917057902974e-06, "loss": 9.9937, "step": 305 }, { "epoch": 1.4349605378544286, "grad_norm": 0.12888303399085999, "learning_rate": 5.211267605633803e-06, "loss": 9.5393, "step": 306 }, { "epoch": 1.4396375328851212, "grad_norm": 0.14450183510780334, "learning_rate": 5.195618153364632e-06, "loss": 10.9441, "step": 307 }, { "epoch": 1.444314527915814, "grad_norm": 0.20856888592243195, "learning_rate": 5.1799687010954625e-06, "loss": 9.6833, "step": 308 }, { "epoch": 1.448991522946507, "grad_norm": 0.23422713577747345, "learning_rate": 5.164319248826292e-06, "loss": 9.2532, "step": 309 }, { "epoch": 1.4536685179771998, "grad_norm": 0.19145800173282623, "learning_rate": 5.148669796557121e-06, "loss": 9.7285, "step": 310 }, { "epoch": 1.4583455130078924, "grad_norm": 0.1990247666835785, "learning_rate": 5.1330203442879505e-06, "loss": 7.6512, "step": 311 }, { "epoch": 1.4630225080385852, "grad_norm": 0.17829596996307373, "learning_rate": 5.117370892018779e-06, "loss": 9.5529, "step": 312 }, { "epoch": 1.4676995030692779, "grad_norm": 0.162981778383255, "learning_rate": 5.10172143974961e-06, "loss": 10.0274, "step": 313 }, { "epoch": 1.4723764980999707, "grad_norm": 0.17965111136436462, "learning_rate": 5.086071987480438e-06, "loss": 9.2513, "step": 314 }, { "epoch": 1.4770534931306636, "grad_norm": 0.28804492950439453, "learning_rate": 5.070422535211268e-06, "loss": 10.0194, "step": 315 }, { "epoch": 1.4817304881613564, "grad_norm": 0.1571478545665741, "learning_rate": 5.054773082942097e-06, "loss": 10.0889, "step": 316 }, { "epoch": 1.486407483192049, "grad_norm": 0.2101372927427292, "learning_rate": 5.039123630672926e-06, "loss": 8.6775, "step": 317 }, { "epoch": 1.491084478222742, "grad_norm": 0.20323887467384338, "learning_rate": 5.0234741784037565e-06, "loss": 9.8082, "step": 318 }, { "epoch": 1.4957614732534346, "grad_norm": 0.16192995011806488, "learning_rate": 5.007824726134586e-06, "loss": 8.0025, "step": 319 }, { "epoch": 1.5004384682841274, "grad_norm": 0.16440463066101074, "learning_rate": 4.992175273865415e-06, "loss": 9.579, "step": 320 }, { "epoch": 1.5051154633148203, "grad_norm": 0.19055482745170593, "learning_rate": 4.976525821596244e-06, "loss": 8.7398, "step": 321 }, { "epoch": 1.5097924583455131, "grad_norm": 0.17318573594093323, "learning_rate": 4.960876369327074e-06, "loss": 9.7488, "step": 322 }, { "epoch": 1.5144694533762058, "grad_norm": 0.24867770075798035, "learning_rate": 4.945226917057903e-06, "loss": 10.5706, "step": 323 }, { "epoch": 1.5191464484068986, "grad_norm": 0.1796032041311264, "learning_rate": 4.929577464788733e-06, "loss": 9.4351, "step": 324 }, { "epoch": 1.5238234434375912, "grad_norm": 0.21675661206245422, "learning_rate": 4.9139280125195624e-06, "loss": 10.6771, "step": 325 }, { "epoch": 1.528500438468284, "grad_norm": 0.17892418801784515, "learning_rate": 4.898278560250392e-06, "loss": 7.6976, "step": 326 }, { "epoch": 1.533177433498977, "grad_norm": 0.16854748129844666, "learning_rate": 4.882629107981221e-06, "loss": 9.0202, "step": 327 }, { "epoch": 1.5378544285296698, "grad_norm": 0.20898739993572235, "learning_rate": 4.86697965571205e-06, "loss": 9.3772, "step": 328 }, { "epoch": 1.5425314235603624, "grad_norm": 0.2980878949165344, "learning_rate": 4.85133020344288e-06, "loss": 10.5012, "step": 329 }, { "epoch": 1.5472084185910553, "grad_norm": 0.12076615542173386, "learning_rate": 4.835680751173709e-06, "loss": 10.1389, "step": 330 }, { "epoch": 1.551885413621748, "grad_norm": 0.1814320981502533, "learning_rate": 4.820031298904539e-06, "loss": 8.4015, "step": 331 }, { "epoch": 1.5565624086524408, "grad_norm": 0.16422027349472046, "learning_rate": 4.8043818466353684e-06, "loss": 8.4772, "step": 332 }, { "epoch": 1.5612394036831336, "grad_norm": 0.12222316116094589, "learning_rate": 4.788732394366197e-06, "loss": 8.7358, "step": 333 }, { "epoch": 1.5659163987138265, "grad_norm": 0.20471377670764923, "learning_rate": 4.773082942097027e-06, "loss": 8.9805, "step": 334 }, { "epoch": 1.5705933937445191, "grad_norm": 0.1602873057126999, "learning_rate": 4.757433489827856e-06, "loss": 7.7731, "step": 335 }, { "epoch": 1.575270388775212, "grad_norm": 0.1620335578918457, "learning_rate": 4.741784037558686e-06, "loss": 8.5971, "step": 336 }, { "epoch": 1.5799473838059046, "grad_norm": 0.14822766184806824, "learning_rate": 4.726134585289515e-06, "loss": 8.1521, "step": 337 }, { "epoch": 1.5846243788365975, "grad_norm": 0.16832107305526733, "learning_rate": 4.710485133020345e-06, "loss": 9.0838, "step": 338 }, { "epoch": 1.5893013738672903, "grad_norm": 0.1385219246149063, "learning_rate": 4.694835680751174e-06, "loss": 7.4367, "step": 339 }, { "epoch": 1.5939783688979832, "grad_norm": 0.13664643466472626, "learning_rate": 4.679186228482004e-06, "loss": 8.5027, "step": 340 }, { "epoch": 1.5986553639286758, "grad_norm": 0.18891537189483643, "learning_rate": 4.663536776212833e-06, "loss": 8.6301, "step": 341 }, { "epoch": 1.6033323589593687, "grad_norm": 0.19962970912456512, "learning_rate": 4.647887323943662e-06, "loss": 10.6293, "step": 342 }, { "epoch": 1.6080093539900613, "grad_norm": 0.18747878074645996, "learning_rate": 4.632237871674492e-06, "loss": 10.0322, "step": 343 }, { "epoch": 1.6126863490207541, "grad_norm": 0.3010605573654175, "learning_rate": 4.616588419405321e-06, "loss": 9.1209, "step": 344 }, { "epoch": 1.617363344051447, "grad_norm": 0.11245454847812653, "learning_rate": 4.60093896713615e-06, "loss": 8.0594, "step": 345 }, { "epoch": 1.6220403390821398, "grad_norm": 0.20886649191379547, "learning_rate": 4.58528951486698e-06, "loss": 9.1715, "step": 346 }, { "epoch": 1.6267173341128325, "grad_norm": 0.14630508422851562, "learning_rate": 4.56964006259781e-06, "loss": 8.7735, "step": 347 }, { "epoch": 1.6313943291435253, "grad_norm": 0.21093368530273438, "learning_rate": 4.553990610328639e-06, "loss": 8.2183, "step": 348 }, { "epoch": 1.636071324174218, "grad_norm": 0.22136329114437103, "learning_rate": 4.538341158059468e-06, "loss": 9.067, "step": 349 }, { "epoch": 1.6407483192049108, "grad_norm": 0.15906454622745514, "learning_rate": 4.522691705790298e-06, "loss": 9.3209, "step": 350 }, { "epoch": 1.6454253142356037, "grad_norm": 0.2312268763780594, "learning_rate": 4.507042253521127e-06, "loss": 9.2316, "step": 351 }, { "epoch": 1.6501023092662965, "grad_norm": 0.24528440833091736, "learning_rate": 4.491392801251956e-06, "loss": 9.0482, "step": 352 }, { "epoch": 1.6547793042969892, "grad_norm": 0.19777342677116394, "learning_rate": 4.475743348982786e-06, "loss": 10.1556, "step": 353 }, { "epoch": 1.659456299327682, "grad_norm": 0.2033587247133255, "learning_rate": 4.460093896713616e-06, "loss": 8.9973, "step": 354 }, { "epoch": 1.6641332943583746, "grad_norm": 0.16927585005760193, "learning_rate": 4.444444444444444e-06, "loss": 9.5144, "step": 355 }, { "epoch": 1.6688102893890675, "grad_norm": 0.16959340870380402, "learning_rate": 4.428794992175274e-06, "loss": 9.5447, "step": 356 }, { "epoch": 1.6734872844197604, "grad_norm": 0.18593505024909973, "learning_rate": 4.413145539906104e-06, "loss": 9.6471, "step": 357 }, { "epoch": 1.6781642794504532, "grad_norm": 0.16945506632328033, "learning_rate": 4.397496087636933e-06, "loss": 8.5418, "step": 358 }, { "epoch": 1.6828412744811458, "grad_norm": 0.16277293860912323, "learning_rate": 4.381846635367762e-06, "loss": 9.2884, "step": 359 }, { "epoch": 1.6875182695118387, "grad_norm": 0.2155790776014328, "learning_rate": 4.3661971830985915e-06, "loss": 9.4547, "step": 360 }, { "epoch": 1.6921952645425313, "grad_norm": 0.19257700443267822, "learning_rate": 4.350547730829422e-06, "loss": 8.7859, "step": 361 }, { "epoch": 1.6968722595732242, "grad_norm": 0.21113352477550507, "learning_rate": 4.334898278560251e-06, "loss": 9.3654, "step": 362 }, { "epoch": 1.701549254603917, "grad_norm": 0.17781415581703186, "learning_rate": 4.31924882629108e-06, "loss": 9.5482, "step": 363 }, { "epoch": 1.70622624963461, "grad_norm": 0.14610658586025238, "learning_rate": 4.30359937402191e-06, "loss": 9.2182, "step": 364 }, { "epoch": 1.7109032446653025, "grad_norm": 0.19297371804714203, "learning_rate": 4.287949921752739e-06, "loss": 8.5858, "step": 365 }, { "epoch": 1.7155802396959952, "grad_norm": 0.16764657199382782, "learning_rate": 4.272300469483568e-06, "loss": 8.6679, "step": 366 }, { "epoch": 1.720257234726688, "grad_norm": 0.1740255355834961, "learning_rate": 4.2566510172143975e-06, "loss": 8.3984, "step": 367 }, { "epoch": 1.7249342297573809, "grad_norm": 0.2171589732170105, "learning_rate": 4.241001564945228e-06, "loss": 8.6767, "step": 368 }, { "epoch": 1.7296112247880737, "grad_norm": 0.15334008634090424, "learning_rate": 4.225352112676057e-06, "loss": 9.0357, "step": 369 }, { "epoch": 1.7342882198187666, "grad_norm": 0.1901715248823166, "learning_rate": 4.209702660406886e-06, "loss": 9.1397, "step": 370 }, { "epoch": 1.7389652148494592, "grad_norm": 0.14479465782642365, "learning_rate": 4.194053208137716e-06, "loss": 8.0689, "step": 371 }, { "epoch": 1.7436422098801518, "grad_norm": 0.13776177167892456, "learning_rate": 4.178403755868545e-06, "loss": 8.2216, "step": 372 }, { "epoch": 1.7483192049108447, "grad_norm": 0.13980716466903687, "learning_rate": 4.162754303599374e-06, "loss": 10.2694, "step": 373 }, { "epoch": 1.7529961999415375, "grad_norm": 0.15243536233901978, "learning_rate": 4.1471048513302035e-06, "loss": 8.4832, "step": 374 }, { "epoch": 1.7576731949722304, "grad_norm": 0.1408737152814865, "learning_rate": 4.131455399061034e-06, "loss": 10.5995, "step": 375 }, { "epoch": 1.7623501900029233, "grad_norm": 0.16743288934230804, "learning_rate": 4.115805946791863e-06, "loss": 9.0306, "step": 376 }, { "epoch": 1.7670271850336159, "grad_norm": 0.13096289336681366, "learning_rate": 4.100156494522691e-06, "loss": 8.799, "step": 377 }, { "epoch": 1.7717041800643085, "grad_norm": 0.18536189198493958, "learning_rate": 4.0845070422535216e-06, "loss": 8.6714, "step": 378 }, { "epoch": 1.7763811750950014, "grad_norm": 0.21224500238895416, "learning_rate": 4.068857589984351e-06, "loss": 8.8822, "step": 379 }, { "epoch": 1.7810581701256942, "grad_norm": 0.15303047001361847, "learning_rate": 4.05320813771518e-06, "loss": 8.8666, "step": 380 }, { "epoch": 1.785735165156387, "grad_norm": 0.14419591426849365, "learning_rate": 4.0375586854460095e-06, "loss": 8.916, "step": 381 }, { "epoch": 1.79041216018708, "grad_norm": 0.1363951712846756, "learning_rate": 4.021909233176839e-06, "loss": 8.3857, "step": 382 }, { "epoch": 1.7950891552177726, "grad_norm": 0.20621058344841003, "learning_rate": 4.006259780907669e-06, "loss": 10.1237, "step": 383 }, { "epoch": 1.7997661502484652, "grad_norm": 0.21105414628982544, "learning_rate": 3.990610328638498e-06, "loss": 9.5554, "step": 384 }, { "epoch": 1.804443145279158, "grad_norm": 0.21915097534656525, "learning_rate": 3.9749608763693276e-06, "loss": 7.717, "step": 385 }, { "epoch": 1.809120140309851, "grad_norm": 0.17555522918701172, "learning_rate": 3.959311424100157e-06, "loss": 9.1899, "step": 386 }, { "epoch": 1.8137971353405438, "grad_norm": 0.1890765279531479, "learning_rate": 3.943661971830986e-06, "loss": 8.0672, "step": 387 }, { "epoch": 1.8184741303712366, "grad_norm": 0.16451717913150787, "learning_rate": 3.9280125195618155e-06, "loss": 8.8205, "step": 388 }, { "epoch": 1.8231511254019293, "grad_norm": 0.16023708879947662, "learning_rate": 3.912363067292645e-06, "loss": 9.319, "step": 389 }, { "epoch": 1.8278281204326219, "grad_norm": 0.15548115968704224, "learning_rate": 3.896713615023475e-06, "loss": 8.2246, "step": 390 }, { "epoch": 1.8325051154633147, "grad_norm": 0.21226494014263153, "learning_rate": 3.881064162754304e-06, "loss": 9.135, "step": 391 }, { "epoch": 1.8371821104940076, "grad_norm": 0.14461496472358704, "learning_rate": 3.865414710485133e-06, "loss": 8.962, "step": 392 }, { "epoch": 1.8418591055247004, "grad_norm": 0.20766492187976837, "learning_rate": 3.849765258215963e-06, "loss": 8.8991, "step": 393 }, { "epoch": 1.8465361005553933, "grad_norm": 0.20327630639076233, "learning_rate": 3.834115805946792e-06, "loss": 9.1291, "step": 394 }, { "epoch": 1.851213095586086, "grad_norm": 0.23052388429641724, "learning_rate": 3.8184663536776215e-06, "loss": 8.3602, "step": 395 }, { "epoch": 1.8558900906167786, "grad_norm": 0.16140541434288025, "learning_rate": 3.8028169014084508e-06, "loss": 9.3176, "step": 396 }, { "epoch": 1.8605670856474714, "grad_norm": 0.17049185931682587, "learning_rate": 3.7871674491392805e-06, "loss": 8.6602, "step": 397 }, { "epoch": 1.8652440806781643, "grad_norm": 0.11496849358081818, "learning_rate": 3.77151799687011e-06, "loss": 10.3293, "step": 398 }, { "epoch": 1.8699210757088571, "grad_norm": 0.1907191127538681, "learning_rate": 3.755868544600939e-06, "loss": 8.4035, "step": 399 }, { "epoch": 1.87459807073955, "grad_norm": 0.16409359872341156, "learning_rate": 3.740219092331769e-06, "loss": 8.9062, "step": 400 }, { "epoch": 1.8792750657702426, "grad_norm": 0.15642918646335602, "learning_rate": 3.724569640062598e-06, "loss": 8.8751, "step": 401 }, { "epoch": 1.8839520608009352, "grad_norm": 0.1641726940870285, "learning_rate": 3.708920187793428e-06, "loss": 8.3851, "step": 402 }, { "epoch": 1.888629055831628, "grad_norm": 0.15342937409877777, "learning_rate": 3.693270735524257e-06, "loss": 9.3965, "step": 403 }, { "epoch": 1.893306050862321, "grad_norm": 0.15916384756565094, "learning_rate": 3.677621283255086e-06, "loss": 8.7446, "step": 404 }, { "epoch": 1.8979830458930138, "grad_norm": 0.21401815116405487, "learning_rate": 3.6619718309859158e-06, "loss": 8.8994, "step": 405 }, { "epoch": 1.9026600409237067, "grad_norm": 0.19148550927639008, "learning_rate": 3.646322378716745e-06, "loss": 8.5996, "step": 406 }, { "epoch": 1.9073370359543993, "grad_norm": 0.1755845844745636, "learning_rate": 3.630672926447575e-06, "loss": 8.7611, "step": 407 }, { "epoch": 1.912014030985092, "grad_norm": 0.17193089425563812, "learning_rate": 3.615023474178404e-06, "loss": 8.9488, "step": 408 }, { "epoch": 1.9166910260157848, "grad_norm": 0.17173364758491516, "learning_rate": 3.5993740219092334e-06, "loss": 8.0517, "step": 409 }, { "epoch": 1.9213680210464776, "grad_norm": 0.22657723724842072, "learning_rate": 3.583724569640063e-06, "loss": 8.7361, "step": 410 }, { "epoch": 1.9260450160771705, "grad_norm": 0.21941417455673218, "learning_rate": 3.568075117370892e-06, "loss": 9.2343, "step": 411 }, { "epoch": 1.9307220111078633, "grad_norm": 0.18514755368232727, "learning_rate": 3.5524256651017218e-06, "loss": 8.2767, "step": 412 }, { "epoch": 1.935399006138556, "grad_norm": 0.13066066801548004, "learning_rate": 3.536776212832551e-06, "loss": 8.7371, "step": 413 }, { "epoch": 1.9400760011692486, "grad_norm": 0.16903606057167053, "learning_rate": 3.5211267605633804e-06, "loss": 9.3067, "step": 414 }, { "epoch": 1.9447529961999415, "grad_norm": 0.14286428689956665, "learning_rate": 3.50547730829421e-06, "loss": 7.8586, "step": 415 }, { "epoch": 1.9494299912306343, "grad_norm": 0.1969095915555954, "learning_rate": 3.4898278560250394e-06, "loss": 9.6053, "step": 416 }, { "epoch": 1.9541069862613272, "grad_norm": 0.1750202775001526, "learning_rate": 3.474178403755869e-06, "loss": 9.0714, "step": 417 }, { "epoch": 1.95878398129202, "grad_norm": 0.21293002367019653, "learning_rate": 3.458528951486698e-06, "loss": 9.8726, "step": 418 }, { "epoch": 1.9634609763227127, "grad_norm": 0.1672164648771286, "learning_rate": 3.4428794992175273e-06, "loss": 9.5275, "step": 419 }, { "epoch": 1.9681379713534053, "grad_norm": 0.17561869323253632, "learning_rate": 3.427230046948357e-06, "loss": 7.2097, "step": 420 }, { "epoch": 1.9728149663840981, "grad_norm": 0.16326965391635895, "learning_rate": 3.4115805946791864e-06, "loss": 9.3302, "step": 421 }, { "epoch": 1.977491961414791, "grad_norm": 0.15163388848304749, "learning_rate": 3.395931142410016e-06, "loss": 9.1933, "step": 422 }, { "epoch": 1.9821689564454839, "grad_norm": 0.16277414560317993, "learning_rate": 3.3802816901408454e-06, "loss": 8.3196, "step": 423 }, { "epoch": 1.9868459514761767, "grad_norm": 0.18385657668113708, "learning_rate": 3.364632237871675e-06, "loss": 8.1472, "step": 424 }, { "epoch": 1.9915229465068693, "grad_norm": 0.1768423169851303, "learning_rate": 3.348982785602504e-06, "loss": 8.3639, "step": 425 }, { "epoch": 1.996199941537562, "grad_norm": 0.2325451821088791, "learning_rate": 3.3333333333333333e-06, "loss": 9.265, "step": 426 }, { "epoch": 2.004676995030693, "grad_norm": 0.22825832664966583, "learning_rate": 3.317683881064163e-06, "loss": 10.0732, "step": 427 }, { "epoch": 2.0093539900613857, "grad_norm": 0.16034899652004242, "learning_rate": 3.3020344287949924e-06, "loss": 7.7232, "step": 428 }, { "epoch": 2.014030985092078, "grad_norm": 0.1737372726202011, "learning_rate": 3.286384976525822e-06, "loss": 8.0928, "step": 429 }, { "epoch": 2.018707980122771, "grad_norm": 0.20644846558570862, "learning_rate": 3.2707355242566514e-06, "loss": 8.6956, "step": 430 }, { "epoch": 2.023384975153464, "grad_norm": 0.3140431344509125, "learning_rate": 3.2550860719874807e-06, "loss": 9.0442, "step": 431 }, { "epoch": 2.0280619701841567, "grad_norm": 0.2457619458436966, "learning_rate": 3.2394366197183104e-06, "loss": 8.6256, "step": 432 }, { "epoch": 2.0327389652148495, "grad_norm": 0.2014688104391098, "learning_rate": 3.2237871674491393e-06, "loss": 9.7276, "step": 433 }, { "epoch": 2.0374159602455424, "grad_norm": 0.1970800757408142, "learning_rate": 3.208137715179969e-06, "loss": 8.19, "step": 434 }, { "epoch": 2.0420929552762352, "grad_norm": 0.12662629783153534, "learning_rate": 3.1924882629107983e-06, "loss": 9.235, "step": 435 }, { "epoch": 2.0467699503069277, "grad_norm": 0.15353932976722717, "learning_rate": 3.1768388106416277e-06, "loss": 8.8255, "step": 436 }, { "epoch": 2.0514469453376205, "grad_norm": 0.2180812507867813, "learning_rate": 3.1611893583724574e-06, "loss": 9.1142, "step": 437 }, { "epoch": 2.0561239403683134, "grad_norm": 0.18303510546684265, "learning_rate": 3.1455399061032867e-06, "loss": 10.1061, "step": 438 }, { "epoch": 2.060800935399006, "grad_norm": 0.15254124999046326, "learning_rate": 3.1298904538341164e-06, "loss": 8.5431, "step": 439 }, { "epoch": 2.065477930429699, "grad_norm": 0.16063688695430756, "learning_rate": 3.1142410015649453e-06, "loss": 8.4382, "step": 440 }, { "epoch": 2.0701549254603915, "grad_norm": 0.20583708584308624, "learning_rate": 3.0985915492957746e-06, "loss": 8.1778, "step": 441 }, { "epoch": 2.0748319204910843, "grad_norm": 0.11699045449495316, "learning_rate": 3.0829420970266043e-06, "loss": 7.8459, "step": 442 }, { "epoch": 2.079508915521777, "grad_norm": 0.1605014204978943, "learning_rate": 3.0672926447574336e-06, "loss": 8.4224, "step": 443 }, { "epoch": 2.08418591055247, "grad_norm": 0.14405608177185059, "learning_rate": 3.0516431924882634e-06, "loss": 8.3442, "step": 444 }, { "epoch": 2.088862905583163, "grad_norm": 0.17145852744579315, "learning_rate": 3.0359937402190927e-06, "loss": 8.7685, "step": 445 }, { "epoch": 2.0935399006138558, "grad_norm": 0.14711640775203705, "learning_rate": 3.0203442879499216e-06, "loss": 7.3568, "step": 446 }, { "epoch": 2.098216895644548, "grad_norm": 0.13734185695648193, "learning_rate": 3.0046948356807513e-06, "loss": 8.4425, "step": 447 }, { "epoch": 2.102893890675241, "grad_norm": 0.1571117639541626, "learning_rate": 2.9890453834115806e-06, "loss": 7.6952, "step": 448 }, { "epoch": 2.107570885705934, "grad_norm": 0.15319029986858368, "learning_rate": 2.9733959311424103e-06, "loss": 7.9937, "step": 449 }, { "epoch": 2.1122478807366267, "grad_norm": 0.18363691866397858, "learning_rate": 2.9577464788732396e-06, "loss": 8.4406, "step": 450 }, { "epoch": 2.1169248757673196, "grad_norm": 0.1433074176311493, "learning_rate": 2.9420970266040694e-06, "loss": 9.7219, "step": 451 }, { "epoch": 2.1216018707980124, "grad_norm": 0.14841365814208984, "learning_rate": 2.9264475743348987e-06, "loss": 8.2459, "step": 452 }, { "epoch": 2.126278865828705, "grad_norm": 0.18753403425216675, "learning_rate": 2.910798122065728e-06, "loss": 8.7057, "step": 453 }, { "epoch": 2.1309558608593977, "grad_norm": 0.1748085618019104, "learning_rate": 2.8951486697965577e-06, "loss": 8.5651, "step": 454 }, { "epoch": 2.1356328558900906, "grad_norm": 0.17874014377593994, "learning_rate": 2.8794992175273866e-06, "loss": 8.5838, "step": 455 }, { "epoch": 2.1403098509207834, "grad_norm": 0.16495150327682495, "learning_rate": 2.8638497652582163e-06, "loss": 9.8249, "step": 456 }, { "epoch": 2.1449868459514763, "grad_norm": 0.12347421795129776, "learning_rate": 2.8482003129890456e-06, "loss": 7.1875, "step": 457 }, { "epoch": 2.149663840982169, "grad_norm": 0.1617746353149414, "learning_rate": 2.832550860719875e-06, "loss": 7.7209, "step": 458 }, { "epoch": 2.154340836012862, "grad_norm": 0.160769984126091, "learning_rate": 2.8169014084507046e-06, "loss": 7.7851, "step": 459 }, { "epoch": 2.1590178310435544, "grad_norm": 0.14725424349308014, "learning_rate": 2.801251956181534e-06, "loss": 7.8194, "step": 460 }, { "epoch": 2.1636948260742472, "grad_norm": 0.11912764608860016, "learning_rate": 2.7856025039123637e-06, "loss": 7.7984, "step": 461 }, { "epoch": 2.16837182110494, "grad_norm": 0.17748208343982697, "learning_rate": 2.7699530516431926e-06, "loss": 8.0672, "step": 462 }, { "epoch": 2.173048816135633, "grad_norm": 0.1708259880542755, "learning_rate": 2.754303599374022e-06, "loss": 9.2099, "step": 463 }, { "epoch": 2.177725811166326, "grad_norm": 0.15187622606754303, "learning_rate": 2.7386541471048516e-06, "loss": 8.3165, "step": 464 }, { "epoch": 2.182402806197018, "grad_norm": 0.18263490498065948, "learning_rate": 2.723004694835681e-06, "loss": 9.9331, "step": 465 }, { "epoch": 2.187079801227711, "grad_norm": 0.12427602708339691, "learning_rate": 2.7073552425665106e-06, "loss": 8.5229, "step": 466 }, { "epoch": 2.191756796258404, "grad_norm": 0.13961510360240936, "learning_rate": 2.69170579029734e-06, "loss": 8.3661, "step": 467 }, { "epoch": 2.1964337912890968, "grad_norm": 0.14999401569366455, "learning_rate": 2.676056338028169e-06, "loss": 7.2095, "step": 468 }, { "epoch": 2.2011107863197896, "grad_norm": 0.14472222328186035, "learning_rate": 2.6604068857589986e-06, "loss": 8.6861, "step": 469 }, { "epoch": 2.2057877813504825, "grad_norm": 0.14089444279670715, "learning_rate": 2.644757433489828e-06, "loss": 7.8008, "step": 470 }, { "epoch": 2.210464776381175, "grad_norm": 0.13669133186340332, "learning_rate": 2.6291079812206576e-06, "loss": 8.5063, "step": 471 }, { "epoch": 2.2151417714118677, "grad_norm": 0.1669352799654007, "learning_rate": 2.613458528951487e-06, "loss": 8.7907, "step": 472 }, { "epoch": 2.2198187664425606, "grad_norm": 0.15821270644664764, "learning_rate": 2.597809076682316e-06, "loss": 8.0308, "step": 473 }, { "epoch": 2.2244957614732535, "grad_norm": 0.21483926475048065, "learning_rate": 2.582159624413146e-06, "loss": 8.502, "step": 474 }, { "epoch": 2.2291727565039463, "grad_norm": 0.18459928035736084, "learning_rate": 2.5665101721439752e-06, "loss": 9.5206, "step": 475 }, { "epoch": 2.233849751534639, "grad_norm": 0.1487099826335907, "learning_rate": 2.550860719874805e-06, "loss": 6.9168, "step": 476 }, { "epoch": 2.2385267465653316, "grad_norm": 0.2513448894023895, "learning_rate": 2.535211267605634e-06, "loss": 9.3783, "step": 477 }, { "epoch": 2.2432037415960244, "grad_norm": 0.1873185932636261, "learning_rate": 2.519561815336463e-06, "loss": 8.2886, "step": 478 }, { "epoch": 2.2478807366267173, "grad_norm": 0.19832056760787964, "learning_rate": 2.503912363067293e-06, "loss": 8.1959, "step": 479 }, { "epoch": 2.25255773165741, "grad_norm": 0.20701546967029572, "learning_rate": 2.488262910798122e-06, "loss": 8.1702, "step": 480 }, { "epoch": 2.257234726688103, "grad_norm": 0.12690390646457672, "learning_rate": 2.4726134585289515e-06, "loss": 8.7747, "step": 481 }, { "epoch": 2.261911721718796, "grad_norm": 0.1636572629213333, "learning_rate": 2.4569640062597812e-06, "loss": 7.8555, "step": 482 }, { "epoch": 2.2665887167494887, "grad_norm": 0.12632915377616882, "learning_rate": 2.4413145539906105e-06, "loss": 7.9758, "step": 483 }, { "epoch": 2.271265711780181, "grad_norm": 0.16761943697929382, "learning_rate": 2.42566510172144e-06, "loss": 8.0032, "step": 484 }, { "epoch": 2.275942706810874, "grad_norm": 0.15796944499015808, "learning_rate": 2.4100156494522696e-06, "loss": 8.8154, "step": 485 }, { "epoch": 2.280619701841567, "grad_norm": 0.16528886556625366, "learning_rate": 2.3943661971830984e-06, "loss": 7.3999, "step": 486 }, { "epoch": 2.2852966968722597, "grad_norm": 0.14766015112400055, "learning_rate": 2.378716744913928e-06, "loss": 7.8343, "step": 487 }, { "epoch": 2.2899736919029525, "grad_norm": 0.12624794244766235, "learning_rate": 2.3630672926447575e-06, "loss": 8.0017, "step": 488 }, { "epoch": 2.294650686933645, "grad_norm": 0.16594719886779785, "learning_rate": 2.347417840375587e-06, "loss": 7.7649, "step": 489 }, { "epoch": 2.299327681964338, "grad_norm": 0.1574728637933731, "learning_rate": 2.3317683881064165e-06, "loss": 9.2884, "step": 490 }, { "epoch": 2.3040046769950306, "grad_norm": 0.1298084557056427, "learning_rate": 2.316118935837246e-06, "loss": 8.4339, "step": 491 }, { "epoch": 2.3086816720257235, "grad_norm": 0.15643304586410522, "learning_rate": 2.300469483568075e-06, "loss": 8.0997, "step": 492 }, { "epoch": 2.3133586670564164, "grad_norm": 0.13263966143131256, "learning_rate": 2.284820031298905e-06, "loss": 8.109, "step": 493 }, { "epoch": 2.318035662087109, "grad_norm": 0.21980319917201996, "learning_rate": 2.269170579029734e-06, "loss": 8.2741, "step": 494 }, { "epoch": 2.322712657117802, "grad_norm": 0.13680629432201385, "learning_rate": 2.2535211267605635e-06, "loss": 8.5315, "step": 495 }, { "epoch": 2.3273896521484945, "grad_norm": 0.1529272496700287, "learning_rate": 2.237871674491393e-06, "loss": 8.0531, "step": 496 }, { "epoch": 2.3320666471791873, "grad_norm": 0.174594908952713, "learning_rate": 2.222222222222222e-06, "loss": 7.7507, "step": 497 }, { "epoch": 2.33674364220988, "grad_norm": 0.17085200548171997, "learning_rate": 2.206572769953052e-06, "loss": 7.1328, "step": 498 }, { "epoch": 2.341420637240573, "grad_norm": 0.14975635707378387, "learning_rate": 2.190923317683881e-06, "loss": 9.8064, "step": 499 }, { "epoch": 2.346097632271266, "grad_norm": 0.15309952199459076, "learning_rate": 2.175273865414711e-06, "loss": 8.6898, "step": 500 }, { "epoch": 2.3507746273019583, "grad_norm": 0.13084295392036438, "learning_rate": 2.15962441314554e-06, "loss": 8.1384, "step": 501 }, { "epoch": 2.355451622332651, "grad_norm": 0.16496095061302185, "learning_rate": 2.1439749608763695e-06, "loss": 8.9057, "step": 502 }, { "epoch": 2.360128617363344, "grad_norm": 0.157500758767128, "learning_rate": 2.1283255086071988e-06, "loss": 7.967, "step": 503 }, { "epoch": 2.364805612394037, "grad_norm": 0.1988188475370407, "learning_rate": 2.1126760563380285e-06, "loss": 7.4129, "step": 504 }, { "epoch": 2.3694826074247297, "grad_norm": 0.21104207634925842, "learning_rate": 2.097026604068858e-06, "loss": 7.5442, "step": 505 }, { "epoch": 2.3741596024554226, "grad_norm": 0.20285457372665405, "learning_rate": 2.081377151799687e-06, "loss": 7.3523, "step": 506 }, { "epoch": 2.378836597486115, "grad_norm": 0.24479469656944275, "learning_rate": 2.065727699530517e-06, "loss": 7.8577, "step": 507 }, { "epoch": 2.383513592516808, "grad_norm": 0.150054469704628, "learning_rate": 2.0500782472613457e-06, "loss": 8.2585, "step": 508 }, { "epoch": 2.3881905875475007, "grad_norm": 0.12602077424526215, "learning_rate": 2.0344287949921754e-06, "loss": 7.7554, "step": 509 }, { "epoch": 2.3928675825781935, "grad_norm": 0.18626457452774048, "learning_rate": 2.0187793427230047e-06, "loss": 7.5464, "step": 510 }, { "epoch": 2.3975445776088864, "grad_norm": 0.20931190252304077, "learning_rate": 2.0031298904538345e-06, "loss": 7.5159, "step": 511 }, { "epoch": 2.4022215726395793, "grad_norm": 0.2555796802043915, "learning_rate": 1.9874804381846638e-06, "loss": 7.5645, "step": 512 }, { "epoch": 2.4068985676702717, "grad_norm": 0.17398537695407867, "learning_rate": 1.971830985915493e-06, "loss": 8.211, "step": 513 }, { "epoch": 2.4115755627009645, "grad_norm": 0.19993047416210175, "learning_rate": 1.9561815336463224e-06, "loss": 8.3602, "step": 514 }, { "epoch": 2.4162525577316574, "grad_norm": 0.15980151295661926, "learning_rate": 1.940532081377152e-06, "loss": 7.6245, "step": 515 }, { "epoch": 2.4209295527623502, "grad_norm": 0.16947968304157257, "learning_rate": 1.9248826291079814e-06, "loss": 8.2847, "step": 516 }, { "epoch": 2.425606547793043, "grad_norm": 0.1670764982700348, "learning_rate": 1.9092331768388107e-06, "loss": 8.4169, "step": 517 }, { "epoch": 2.430283542823736, "grad_norm": 0.17053499817848206, "learning_rate": 1.8935837245696402e-06, "loss": 8.8886, "step": 518 }, { "epoch": 2.4349605378544283, "grad_norm": 0.16047680377960205, "learning_rate": 1.8779342723004696e-06, "loss": 7.9574, "step": 519 }, { "epoch": 2.439637532885121, "grad_norm": 0.2619805932044983, "learning_rate": 1.862284820031299e-06, "loss": 8.203, "step": 520 }, { "epoch": 2.444314527915814, "grad_norm": 0.2122809886932373, "learning_rate": 1.8466353677621286e-06, "loss": 7.8094, "step": 521 }, { "epoch": 2.448991522946507, "grad_norm": 0.15507692098617554, "learning_rate": 1.8309859154929579e-06, "loss": 7.7085, "step": 522 }, { "epoch": 2.4536685179771998, "grad_norm": 0.1406126171350479, "learning_rate": 1.8153364632237874e-06, "loss": 8.483, "step": 523 }, { "epoch": 2.4583455130078926, "grad_norm": 0.19436419010162354, "learning_rate": 1.7996870109546167e-06, "loss": 8.271, "step": 524 }, { "epoch": 2.463022508038585, "grad_norm": 0.17198602855205536, "learning_rate": 1.784037558685446e-06, "loss": 8.3665, "step": 525 }, { "epoch": 2.467699503069278, "grad_norm": 0.28165027499198914, "learning_rate": 1.7683881064162755e-06, "loss": 7.8636, "step": 526 }, { "epoch": 2.4723764980999707, "grad_norm": 0.2032092958688736, "learning_rate": 1.752738654147105e-06, "loss": 7.5732, "step": 527 }, { "epoch": 2.4770534931306636, "grad_norm": 0.13977749645709991, "learning_rate": 1.7370892018779346e-06, "loss": 7.2479, "step": 528 }, { "epoch": 2.4817304881613564, "grad_norm": 0.13071084022521973, "learning_rate": 1.7214397496087637e-06, "loss": 7.008, "step": 529 }, { "epoch": 2.4864074831920493, "grad_norm": 0.15741536021232605, "learning_rate": 1.7057902973395932e-06, "loss": 8.0612, "step": 530 }, { "epoch": 2.4910844782227417, "grad_norm": 0.16548508405685425, "learning_rate": 1.6901408450704227e-06, "loss": 8.0312, "step": 531 }, { "epoch": 2.4957614732534346, "grad_norm": 0.16299135982990265, "learning_rate": 1.674491392801252e-06, "loss": 8.8502, "step": 532 }, { "epoch": 2.5004384682841274, "grad_norm": 0.159685879945755, "learning_rate": 1.6588419405320815e-06, "loss": 9.5205, "step": 533 }, { "epoch": 2.5051154633148203, "grad_norm": 0.1804819405078888, "learning_rate": 1.643192488262911e-06, "loss": 7.683, "step": 534 }, { "epoch": 2.509792458345513, "grad_norm": 0.16809211671352386, "learning_rate": 1.6275430359937403e-06, "loss": 8.6418, "step": 535 }, { "epoch": 2.514469453376206, "grad_norm": 0.17984607815742493, "learning_rate": 1.6118935837245697e-06, "loss": 7.68, "step": 536 }, { "epoch": 2.5191464484068984, "grad_norm": 0.17649582028388977, "learning_rate": 1.5962441314553992e-06, "loss": 8.1753, "step": 537 }, { "epoch": 2.5238234434375912, "grad_norm": 0.16467247903347015, "learning_rate": 1.5805946791862287e-06, "loss": 7.6117, "step": 538 }, { "epoch": 2.528500438468284, "grad_norm": 0.17968781292438507, "learning_rate": 1.5649452269170582e-06, "loss": 8.549, "step": 539 }, { "epoch": 2.533177433498977, "grad_norm": 0.15423156321048737, "learning_rate": 1.5492957746478873e-06, "loss": 8.7104, "step": 540 }, { "epoch": 2.53785442852967, "grad_norm": 0.14077003300189972, "learning_rate": 1.5336463223787168e-06, "loss": 8.934, "step": 541 }, { "epoch": 2.542531423560362, "grad_norm": 0.16637051105499268, "learning_rate": 1.5179968701095463e-06, "loss": 7.4252, "step": 542 }, { "epoch": 2.5472084185910555, "grad_norm": 0.1724003106355667, "learning_rate": 1.5023474178403756e-06, "loss": 7.9955, "step": 543 }, { "epoch": 2.551885413621748, "grad_norm": 0.19609539210796356, "learning_rate": 1.4866979655712052e-06, "loss": 8.3348, "step": 544 }, { "epoch": 2.5565624086524408, "grad_norm": 0.12707825005054474, "learning_rate": 1.4710485133020347e-06, "loss": 8.0848, "step": 545 }, { "epoch": 2.5612394036831336, "grad_norm": 0.2031966894865036, "learning_rate": 1.455399061032864e-06, "loss": 9.8729, "step": 546 }, { "epoch": 2.5659163987138265, "grad_norm": 0.18515604734420776, "learning_rate": 1.4397496087636933e-06, "loss": 7.8293, "step": 547 }, { "epoch": 2.5705933937445193, "grad_norm": 0.15621398389339447, "learning_rate": 1.4241001564945228e-06, "loss": 9.3478, "step": 548 }, { "epoch": 2.5752703887752117, "grad_norm": 0.22210869193077087, "learning_rate": 1.4084507042253523e-06, "loss": 8.1303, "step": 549 }, { "epoch": 2.5799473838059046, "grad_norm": 0.27393949031829834, "learning_rate": 1.3928012519561818e-06, "loss": 8.4729, "step": 550 }, { "epoch": 2.5846243788365975, "grad_norm": 0.13042934238910675, "learning_rate": 1.377151799687011e-06, "loss": 8.2335, "step": 551 }, { "epoch": 2.5893013738672903, "grad_norm": 0.207389697432518, "learning_rate": 1.3615023474178405e-06, "loss": 8.0168, "step": 552 }, { "epoch": 2.593978368897983, "grad_norm": 0.14343053102493286, "learning_rate": 1.34585289514867e-06, "loss": 7.9552, "step": 553 }, { "epoch": 2.5986553639286756, "grad_norm": 0.1722148060798645, "learning_rate": 1.3302034428794993e-06, "loss": 7.6877, "step": 554 }, { "epoch": 2.603332358959369, "grad_norm": 0.18076814711093903, "learning_rate": 1.3145539906103288e-06, "loss": 8.0741, "step": 555 }, { "epoch": 2.6080093539900613, "grad_norm": 0.14633478224277496, "learning_rate": 1.298904538341158e-06, "loss": 7.3683, "step": 556 }, { "epoch": 2.612686349020754, "grad_norm": 0.14783795177936554, "learning_rate": 1.2832550860719876e-06, "loss": 7.8992, "step": 557 }, { "epoch": 2.617363344051447, "grad_norm": 0.15360093116760254, "learning_rate": 1.267605633802817e-06, "loss": 8.8425, "step": 558 }, { "epoch": 2.62204033908214, "grad_norm": 0.1691809445619583, "learning_rate": 1.2519561815336464e-06, "loss": 8.669, "step": 559 }, { "epoch": 2.6267173341128327, "grad_norm": 0.16426807641983032, "learning_rate": 1.2363067292644757e-06, "loss": 9.1265, "step": 560 }, { "epoch": 2.631394329143525, "grad_norm": 0.1331864446401596, "learning_rate": 1.2206572769953053e-06, "loss": 7.4892, "step": 561 }, { "epoch": 2.636071324174218, "grad_norm": 0.1330748349428177, "learning_rate": 1.2050078247261348e-06, "loss": 8.6181, "step": 562 }, { "epoch": 2.640748319204911, "grad_norm": 0.14942462742328644, "learning_rate": 1.189358372456964e-06, "loss": 7.8301, "step": 563 }, { "epoch": 2.6454253142356037, "grad_norm": 0.16964685916900635, "learning_rate": 1.1737089201877936e-06, "loss": 7.1293, "step": 564 }, { "epoch": 2.6501023092662965, "grad_norm": 0.1727379858493805, "learning_rate": 1.158059467918623e-06, "loss": 7.1773, "step": 565 }, { "epoch": 2.654779304296989, "grad_norm": 0.14950168132781982, "learning_rate": 1.1424100156494524e-06, "loss": 7.5172, "step": 566 }, { "epoch": 2.6594562993276822, "grad_norm": 0.16068300604820251, "learning_rate": 1.1267605633802817e-06, "loss": 8.7739, "step": 567 }, { "epoch": 2.6641332943583746, "grad_norm": 0.18006567656993866, "learning_rate": 1.111111111111111e-06, "loss": 8.2067, "step": 568 }, { "epoch": 2.6688102893890675, "grad_norm": 0.19861166179180145, "learning_rate": 1.0954616588419406e-06, "loss": 7.1208, "step": 569 }, { "epoch": 2.6734872844197604, "grad_norm": 0.13374726474285126, "learning_rate": 1.07981220657277e-06, "loss": 7.8631, "step": 570 }, { "epoch": 2.678164279450453, "grad_norm": 0.17814220488071442, "learning_rate": 1.0641627543035994e-06, "loss": 7.4765, "step": 571 }, { "epoch": 2.682841274481146, "grad_norm": 0.22474409639835358, "learning_rate": 1.048513302034429e-06, "loss": 7.0754, "step": 572 }, { "epoch": 2.6875182695118385, "grad_norm": 0.16655339300632477, "learning_rate": 1.0328638497652584e-06, "loss": 7.2505, "step": 573 }, { "epoch": 2.6921952645425313, "grad_norm": 0.172933891415596, "learning_rate": 1.0172143974960877e-06, "loss": 8.0832, "step": 574 }, { "epoch": 2.696872259573224, "grad_norm": 0.14097332954406738, "learning_rate": 1.0015649452269172e-06, "loss": 8.0197, "step": 575 }, { "epoch": 2.701549254603917, "grad_norm": 0.1363203376531601, "learning_rate": 9.859154929577465e-07, "loss": 7.2466, "step": 576 }, { "epoch": 2.70622624963461, "grad_norm": 0.17508287727832794, "learning_rate": 9.70266040688576e-07, "loss": 9.3567, "step": 577 }, { "epoch": 2.7109032446653023, "grad_norm": 0.169004425406456, "learning_rate": 9.546165884194054e-07, "loss": 8.0132, "step": 578 }, { "epoch": 2.715580239695995, "grad_norm": 0.14103683829307556, "learning_rate": 9.389671361502348e-07, "loss": 9.0192, "step": 579 }, { "epoch": 2.720257234726688, "grad_norm": 0.197422057390213, "learning_rate": 9.233176838810643e-07, "loss": 7.2779, "step": 580 }, { "epoch": 2.724934229757381, "grad_norm": 0.1950581669807434, "learning_rate": 9.076682316118937e-07, "loss": 7.6089, "step": 581 }, { "epoch": 2.7296112247880737, "grad_norm": 0.23691439628601074, "learning_rate": 8.92018779342723e-07, "loss": 7.9658, "step": 582 }, { "epoch": 2.7342882198187666, "grad_norm": 0.2558799684047699, "learning_rate": 8.763693270735525e-07, "loss": 8.4791, "step": 583 }, { "epoch": 2.7389652148494594, "grad_norm": 0.17010214924812317, "learning_rate": 8.607198748043818e-07, "loss": 8.4854, "step": 584 }, { "epoch": 2.743642209880152, "grad_norm": 0.13403132557868958, "learning_rate": 8.450704225352114e-07, "loss": 6.7996, "step": 585 }, { "epoch": 2.7483192049108447, "grad_norm": 0.14201347529888153, "learning_rate": 8.294209702660408e-07, "loss": 8.3937, "step": 586 }, { "epoch": 2.7529961999415375, "grad_norm": 0.28258565068244934, "learning_rate": 8.137715179968702e-07, "loss": 9.3709, "step": 587 }, { "epoch": 2.7576731949722304, "grad_norm": 0.17337313294410706, "learning_rate": 7.981220657276996e-07, "loss": 7.6751, "step": 588 }, { "epoch": 2.7623501900029233, "grad_norm": 0.1940070241689682, "learning_rate": 7.824726134585291e-07, "loss": 7.3925, "step": 589 }, { "epoch": 2.7670271850336157, "grad_norm": 0.14429809153079987, "learning_rate": 7.668231611893584e-07, "loss": 7.4402, "step": 590 }, { "epoch": 2.7717041800643085, "grad_norm": 0.17765949666500092, "learning_rate": 7.511737089201878e-07, "loss": 7.6954, "step": 591 }, { "epoch": 2.7763811750950014, "grad_norm": 0.15836399793624878, "learning_rate": 7.355242566510173e-07, "loss": 7.6582, "step": 592 }, { "epoch": 2.7810581701256942, "grad_norm": 0.14881688356399536, "learning_rate": 7.198748043818466e-07, "loss": 8.7099, "step": 593 }, { "epoch": 2.785735165156387, "grad_norm": 0.21536029875278473, "learning_rate": 7.042253521126762e-07, "loss": 8.3015, "step": 594 }, { "epoch": 2.79041216018708, "grad_norm": 0.14025098085403442, "learning_rate": 6.885758998435055e-07, "loss": 8.7512, "step": 595 }, { "epoch": 2.795089155217773, "grad_norm": 0.13290052115917206, "learning_rate": 6.72926447574335e-07, "loss": 7.5792, "step": 596 }, { "epoch": 2.799766150248465, "grad_norm": 0.3149656057357788, "learning_rate": 6.572769953051644e-07, "loss": 8.4982, "step": 597 }, { "epoch": 2.804443145279158, "grad_norm": 0.16543497145175934, "learning_rate": 6.416275430359938e-07, "loss": 8.3097, "step": 598 }, { "epoch": 2.809120140309851, "grad_norm": 0.17708784341812134, "learning_rate": 6.259780907668232e-07, "loss": 8.8047, "step": 599 }, { "epoch": 2.8137971353405438, "grad_norm": 0.14560888707637787, "learning_rate": 6.103286384976526e-07, "loss": 8.0925, "step": 600 }, { "epoch": 2.8184741303712366, "grad_norm": 0.1902446448802948, "learning_rate": 5.94679186228482e-07, "loss": 7.7629, "step": 601 }, { "epoch": 2.823151125401929, "grad_norm": 0.1473388820886612, "learning_rate": 5.790297339593115e-07, "loss": 8.1077, "step": 602 }, { "epoch": 2.827828120432622, "grad_norm": 0.16258402168750763, "learning_rate": 5.633802816901409e-07, "loss": 7.8582, "step": 603 }, { "epoch": 2.8325051154633147, "grad_norm": 0.1769980639219284, "learning_rate": 5.477308294209703e-07, "loss": 7.1382, "step": 604 }, { "epoch": 2.8371821104940076, "grad_norm": 0.1444021314382553, "learning_rate": 5.320813771517997e-07, "loss": 7.1383, "step": 605 }, { "epoch": 2.8418591055247004, "grad_norm": 0.21616753935813904, "learning_rate": 5.164319248826292e-07, "loss": 7.605, "step": 606 }, { "epoch": 2.8465361005553933, "grad_norm": 0.20384635031223297, "learning_rate": 5.007824726134586e-07, "loss": 8.8131, "step": 607 }, { "epoch": 2.851213095586086, "grad_norm": 0.1579245626926422, "learning_rate": 4.85133020344288e-07, "loss": 7.8475, "step": 608 }, { "epoch": 2.8558900906167786, "grad_norm": 0.20689930021762848, "learning_rate": 4.694835680751174e-07, "loss": 7.1765, "step": 609 }, { "epoch": 2.8605670856474714, "grad_norm": 0.1589430868625641, "learning_rate": 4.5383411580594685e-07, "loss": 8.8879, "step": 610 }, { "epoch": 2.8652440806781643, "grad_norm": 0.16509409248828888, "learning_rate": 4.3818466353677626e-07, "loss": 7.6589, "step": 611 }, { "epoch": 2.869921075708857, "grad_norm": 0.1439896821975708, "learning_rate": 4.225352112676057e-07, "loss": 8.0296, "step": 612 }, { "epoch": 2.87459807073955, "grad_norm": 0.19501394033432007, "learning_rate": 4.068857589984351e-07, "loss": 8.5349, "step": 613 }, { "epoch": 2.8792750657702424, "grad_norm": 0.18828211724758148, "learning_rate": 3.9123630672926455e-07, "loss": 7.3593, "step": 614 }, { "epoch": 2.8839520608009352, "grad_norm": 0.15072734653949738, "learning_rate": 3.755868544600939e-07, "loss": 8.2512, "step": 615 }, { "epoch": 2.888629055831628, "grad_norm": 0.1598856896162033, "learning_rate": 3.599374021909233e-07, "loss": 7.3246, "step": 616 }, { "epoch": 2.893306050862321, "grad_norm": 0.15382905304431915, "learning_rate": 3.4428794992175273e-07, "loss": 7.674, "step": 617 }, { "epoch": 2.897983045893014, "grad_norm": 0.13851745426654816, "learning_rate": 3.286384976525822e-07, "loss": 7.6664, "step": 618 }, { "epoch": 2.9026600409237067, "grad_norm": 0.12572415173053741, "learning_rate": 3.129890453834116e-07, "loss": 8.6634, "step": 619 }, { "epoch": 2.9073370359543995, "grad_norm": 0.181121364235878, "learning_rate": 2.97339593114241e-07, "loss": 7.5301, "step": 620 }, { "epoch": 2.912014030985092, "grad_norm": 0.18877944350242615, "learning_rate": 2.8169014084507043e-07, "loss": 7.4353, "step": 621 }, { "epoch": 2.916691026015785, "grad_norm": 0.1800297349691391, "learning_rate": 2.6604068857589984e-07, "loss": 8.0261, "step": 622 }, { "epoch": 2.9213680210464776, "grad_norm": 0.1459706425666809, "learning_rate": 2.503912363067293e-07, "loss": 8.5898, "step": 623 }, { "epoch": 2.9260450160771705, "grad_norm": 0.19272330403327942, "learning_rate": 2.347417840375587e-07, "loss": 7.5655, "step": 624 }, { "epoch": 2.9307220111078633, "grad_norm": 0.13995127379894257, "learning_rate": 2.1909233176838813e-07, "loss": 8.807, "step": 625 }, { "epoch": 2.9353990061385558, "grad_norm": 0.19578878581523895, "learning_rate": 2.0344287949921754e-07, "loss": 8.2283, "step": 626 }, { "epoch": 2.9400760011692486, "grad_norm": 0.18744409084320068, "learning_rate": 1.8779342723004696e-07, "loss": 7.8586, "step": 627 }, { "epoch": 2.9447529961999415, "grad_norm": 0.18906202912330627, "learning_rate": 1.7214397496087637e-07, "loss": 8.9175, "step": 628 }, { "epoch": 2.9494299912306343, "grad_norm": 0.2817856967449188, "learning_rate": 1.564945226917058e-07, "loss": 7.8464, "step": 629 }, { "epoch": 2.954106986261327, "grad_norm": 0.1482636034488678, "learning_rate": 1.4084507042253522e-07, "loss": 8.0478, "step": 630 }, { "epoch": 2.95878398129202, "grad_norm": 0.1729574054479599, "learning_rate": 1.2519561815336465e-07, "loss": 8.3514, "step": 631 }, { "epoch": 2.963460976322713, "grad_norm": 0.23052264750003815, "learning_rate": 1.0954616588419407e-07, "loss": 8.397, "step": 632 }, { "epoch": 2.9681379713534053, "grad_norm": 0.16747911274433136, "learning_rate": 9.389671361502348e-08, "loss": 8.0443, "step": 633 }, { "epoch": 2.972814966384098, "grad_norm": 0.14860796928405762, "learning_rate": 7.82472613458529e-08, "loss": 7.317, "step": 634 }, { "epoch": 2.977491961414791, "grad_norm": 0.13674141466617584, "learning_rate": 6.259780907668233e-08, "loss": 7.6038, "step": 635 }, { "epoch": 2.982168956445484, "grad_norm": 0.163039892911911, "learning_rate": 4.694835680751174e-08, "loss": 7.4459, "step": 636 }, { "epoch": 2.9868459514761767, "grad_norm": 0.1598978042602539, "learning_rate": 3.1298904538341164e-08, "loss": 9.9334, "step": 637 }, { "epoch": 2.991522946506869, "grad_norm": 0.16937094926834106, "learning_rate": 1.5649452269170582e-08, "loss": 7.9632, "step": 638 }, { "epoch": 2.996199941537562, "grad_norm": 0.11614558100700378, "learning_rate": 0.0, "loss": 7.3632, "step": 639 }, { "epoch": 2.996199941537562, "step": 639, "total_flos": 2.8450474856619704e+18, "train_loss": 9.811768141524146, "train_runtime": 60574.6064, "train_samples_per_second": 1.355, "train_steps_per_second": 0.011 } ], "logging_steps": 1.0, "max_steps": 639, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8450474856619704e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }