{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1401, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021413276231263384, "grad_norm": 57.48312185632039, "learning_rate": 3.546099290780142e-07, "loss": 11.0536, "step": 1 }, { "epoch": 0.004282655246252677, "grad_norm": 56.04665199470871, "learning_rate": 7.092198581560284e-07, "loss": 11.108, "step": 2 }, { "epoch": 0.006423982869379015, "grad_norm": 57.69353550561418, "learning_rate": 1.0638297872340427e-06, "loss": 11.118, "step": 3 }, { "epoch": 0.008565310492505354, "grad_norm": 57.098115231260124, "learning_rate": 1.4184397163120568e-06, "loss": 11.0464, "step": 4 }, { "epoch": 0.010706638115631691, "grad_norm": 58.67499851336777, "learning_rate": 1.7730496453900712e-06, "loss": 10.9625, "step": 5 }, { "epoch": 0.01284796573875803, "grad_norm": 58.414650577956415, "learning_rate": 2.1276595744680853e-06, "loss": 10.9616, "step": 6 }, { "epoch": 0.014989293361884369, "grad_norm": 64.47207593228225, "learning_rate": 2.4822695035460995e-06, "loss": 10.6887, "step": 7 }, { "epoch": 0.017130620985010708, "grad_norm": 64.98938276175704, "learning_rate": 2.8368794326241136e-06, "loss": 10.6541, "step": 8 }, { "epoch": 0.019271948608137045, "grad_norm": 96.47909270749008, "learning_rate": 3.1914893617021277e-06, "loss": 9.209, "step": 9 }, { "epoch": 0.021413276231263382, "grad_norm": 111.61956792060187, "learning_rate": 3.5460992907801423e-06, "loss": 8.6722, "step": 10 }, { "epoch": 0.023554603854389723, "grad_norm": 66.79651103290082, "learning_rate": 3.9007092198581565e-06, "loss": 3.7251, "step": 11 }, { "epoch": 0.02569593147751606, "grad_norm": 56.39679177509825, "learning_rate": 4.255319148936171e-06, "loss": 3.2238, "step": 12 }, { "epoch": 0.027837259100642397, "grad_norm": 76.3878582360963, "learning_rate": 4.609929078014184e-06, "loss": 2.5959, "step": 13 }, { "epoch": 0.029978586723768737, "grad_norm": 31.950352388440905, "learning_rate": 4.964539007092199e-06, "loss": 2.3433, "step": 14 }, { "epoch": 0.032119914346895075, "grad_norm": 6.951146815684312, "learning_rate": 5.319148936170213e-06, "loss": 1.4451, "step": 15 }, { "epoch": 0.034261241970021415, "grad_norm": 4.818010610834237, "learning_rate": 5.673758865248227e-06, "loss": 1.294, "step": 16 }, { "epoch": 0.03640256959314775, "grad_norm": 3.6480447594140033, "learning_rate": 6.028368794326241e-06, "loss": 1.256, "step": 17 }, { "epoch": 0.03854389721627409, "grad_norm": 2.720810833620355, "learning_rate": 6.3829787234042555e-06, "loss": 1.1491, "step": 18 }, { "epoch": 0.04068522483940043, "grad_norm": 2.19419575616829, "learning_rate": 6.73758865248227e-06, "loss": 1.1041, "step": 19 }, { "epoch": 0.042826552462526764, "grad_norm": 1.5787733821950154, "learning_rate": 7.092198581560285e-06, "loss": 0.9964, "step": 20 }, { "epoch": 0.044967880085653104, "grad_norm": 6.086448551226785, "learning_rate": 7.446808510638298e-06, "loss": 0.9374, "step": 21 }, { "epoch": 0.047109207708779445, "grad_norm": 1.508606587040093, "learning_rate": 7.801418439716313e-06, "loss": 0.9276, "step": 22 }, { "epoch": 0.04925053533190578, "grad_norm": 1.0854043801940743, "learning_rate": 8.156028368794328e-06, "loss": 0.8555, "step": 23 }, { "epoch": 0.05139186295503212, "grad_norm": 0.9443347844565788, "learning_rate": 8.510638297872341e-06, "loss": 0.8648, "step": 24 }, { "epoch": 0.05353319057815846, "grad_norm": 0.8257150817721549, "learning_rate": 8.865248226950355e-06, "loss": 0.8433, "step": 25 }, { "epoch": 0.055674518201284794, "grad_norm": 0.7611313593085876, "learning_rate": 9.219858156028368e-06, "loss": 0.7927, "step": 26 }, { "epoch": 0.057815845824411134, "grad_norm": 0.9175929091877931, "learning_rate": 9.574468085106383e-06, "loss": 0.7758, "step": 27 }, { "epoch": 0.059957173447537475, "grad_norm": 0.6622602227048174, "learning_rate": 9.929078014184398e-06, "loss": 0.724, "step": 28 }, { "epoch": 0.06209850107066381, "grad_norm": 0.673388358753726, "learning_rate": 1.0283687943262411e-05, "loss": 0.7647, "step": 29 }, { "epoch": 0.06423982869379015, "grad_norm": 0.6658973555598119, "learning_rate": 1.0638297872340426e-05, "loss": 0.7529, "step": 30 }, { "epoch": 0.06638115631691649, "grad_norm": 0.7382952955966445, "learning_rate": 1.0992907801418441e-05, "loss": 0.7324, "step": 31 }, { "epoch": 0.06852248394004283, "grad_norm": 0.6870316045369611, "learning_rate": 1.1347517730496454e-05, "loss": 0.7258, "step": 32 }, { "epoch": 0.07066381156316917, "grad_norm": 0.5409451023446604, "learning_rate": 1.170212765957447e-05, "loss": 0.6995, "step": 33 }, { "epoch": 0.0728051391862955, "grad_norm": 0.4718286692000531, "learning_rate": 1.2056737588652483e-05, "loss": 0.6937, "step": 34 }, { "epoch": 0.07494646680942184, "grad_norm": 0.5560095137091816, "learning_rate": 1.2411347517730498e-05, "loss": 0.6849, "step": 35 }, { "epoch": 0.07708779443254818, "grad_norm": 0.5519607067166765, "learning_rate": 1.2765957446808511e-05, "loss": 0.6615, "step": 36 }, { "epoch": 0.07922912205567452, "grad_norm": 0.4650149562699858, "learning_rate": 1.3120567375886524e-05, "loss": 0.6624, "step": 37 }, { "epoch": 0.08137044967880086, "grad_norm": 0.4222769877707214, "learning_rate": 1.347517730496454e-05, "loss": 0.6429, "step": 38 }, { "epoch": 0.0835117773019272, "grad_norm": 0.41323190280074, "learning_rate": 1.3829787234042554e-05, "loss": 0.6267, "step": 39 }, { "epoch": 0.08565310492505353, "grad_norm": 0.45734022776906186, "learning_rate": 1.418439716312057e-05, "loss": 0.6516, "step": 40 }, { "epoch": 0.08779443254817987, "grad_norm": 0.42800824849390673, "learning_rate": 1.4539007092198581e-05, "loss": 0.6654, "step": 41 }, { "epoch": 0.08993576017130621, "grad_norm": 0.33157687702952515, "learning_rate": 1.4893617021276596e-05, "loss": 0.6061, "step": 42 }, { "epoch": 0.09207708779443255, "grad_norm": 0.34740038476735874, "learning_rate": 1.5248226950354611e-05, "loss": 0.6166, "step": 43 }, { "epoch": 0.09421841541755889, "grad_norm": 0.34140646940139696, "learning_rate": 1.5602836879432626e-05, "loss": 0.6102, "step": 44 }, { "epoch": 0.09635974304068523, "grad_norm": 0.3292213866559836, "learning_rate": 1.595744680851064e-05, "loss": 0.6227, "step": 45 }, { "epoch": 0.09850107066381156, "grad_norm": 0.30198419319828534, "learning_rate": 1.6312056737588656e-05, "loss": 0.6107, "step": 46 }, { "epoch": 0.1006423982869379, "grad_norm": 0.31063047872860655, "learning_rate": 1.6666666666666667e-05, "loss": 0.6188, "step": 47 }, { "epoch": 0.10278372591006424, "grad_norm": 0.30350185379074784, "learning_rate": 1.7021276595744682e-05, "loss": 0.5828, "step": 48 }, { "epoch": 0.10492505353319058, "grad_norm": 0.33592357937468176, "learning_rate": 1.7375886524822697e-05, "loss": 0.6074, "step": 49 }, { "epoch": 0.10706638115631692, "grad_norm": 0.2977049583100526, "learning_rate": 1.773049645390071e-05, "loss": 0.5956, "step": 50 }, { "epoch": 0.10920770877944326, "grad_norm": 0.29548881007123645, "learning_rate": 1.8085106382978724e-05, "loss": 0.5875, "step": 51 }, { "epoch": 0.11134903640256959, "grad_norm": 0.3157281891885196, "learning_rate": 1.8439716312056736e-05, "loss": 0.5875, "step": 52 }, { "epoch": 0.11349036402569593, "grad_norm": 0.2866517435357405, "learning_rate": 1.879432624113475e-05, "loss": 0.5827, "step": 53 }, { "epoch": 0.11563169164882227, "grad_norm": 0.2849703271418174, "learning_rate": 1.9148936170212766e-05, "loss": 0.5903, "step": 54 }, { "epoch": 0.11777301927194861, "grad_norm": 0.2883104724475601, "learning_rate": 1.950354609929078e-05, "loss": 0.566, "step": 55 }, { "epoch": 0.11991434689507495, "grad_norm": 0.27848766799340263, "learning_rate": 1.9858156028368796e-05, "loss": 0.6028, "step": 56 }, { "epoch": 0.12205567451820129, "grad_norm": 0.27054084886113955, "learning_rate": 2.0212765957446807e-05, "loss": 0.5738, "step": 57 }, { "epoch": 0.12419700214132762, "grad_norm": 0.28790406613678704, "learning_rate": 2.0567375886524822e-05, "loss": 0.5835, "step": 58 }, { "epoch": 0.12633832976445397, "grad_norm": 0.2609950876468609, "learning_rate": 2.0921985815602837e-05, "loss": 0.5679, "step": 59 }, { "epoch": 0.1284796573875803, "grad_norm": 0.22708508929335028, "learning_rate": 2.1276595744680852e-05, "loss": 0.5644, "step": 60 }, { "epoch": 0.13062098501070663, "grad_norm": 0.29273734573536586, "learning_rate": 2.1631205673758867e-05, "loss": 0.5506, "step": 61 }, { "epoch": 0.13276231263383298, "grad_norm": 0.25373390652035177, "learning_rate": 2.1985815602836882e-05, "loss": 0.5574, "step": 62 }, { "epoch": 0.1349036402569593, "grad_norm": 0.24510687261458605, "learning_rate": 2.2340425531914894e-05, "loss": 0.5977, "step": 63 }, { "epoch": 0.13704496788008566, "grad_norm": 0.24945740897429344, "learning_rate": 2.269503546099291e-05, "loss": 0.55, "step": 64 }, { "epoch": 0.139186295503212, "grad_norm": 0.2366238459163537, "learning_rate": 2.3049645390070924e-05, "loss": 0.5645, "step": 65 }, { "epoch": 0.14132762312633834, "grad_norm": 0.2671806828673314, "learning_rate": 2.340425531914894e-05, "loss": 0.5575, "step": 66 }, { "epoch": 0.14346895074946467, "grad_norm": 0.2693204831683393, "learning_rate": 2.3758865248226954e-05, "loss": 0.5644, "step": 67 }, { "epoch": 0.145610278372591, "grad_norm": 0.27274356761854357, "learning_rate": 2.4113475177304965e-05, "loss": 0.569, "step": 68 }, { "epoch": 0.14775160599571735, "grad_norm": 0.2834119787499386, "learning_rate": 2.446808510638298e-05, "loss": 0.5489, "step": 69 }, { "epoch": 0.14989293361884368, "grad_norm": 0.28557474380682984, "learning_rate": 2.4822695035460995e-05, "loss": 0.5516, "step": 70 }, { "epoch": 0.15203426124197003, "grad_norm": 0.28955467692470377, "learning_rate": 2.5177304964539007e-05, "loss": 0.5489, "step": 71 }, { "epoch": 0.15417558886509636, "grad_norm": 0.28586765508193035, "learning_rate": 2.5531914893617022e-05, "loss": 0.5531, "step": 72 }, { "epoch": 0.15631691648822268, "grad_norm": 0.26463443393592084, "learning_rate": 2.5886524822695034e-05, "loss": 0.5524, "step": 73 }, { "epoch": 0.15845824411134904, "grad_norm": 0.27995499634374477, "learning_rate": 2.624113475177305e-05, "loss": 0.569, "step": 74 }, { "epoch": 0.16059957173447537, "grad_norm": 0.263684223704656, "learning_rate": 2.6595744680851064e-05, "loss": 0.5287, "step": 75 }, { "epoch": 0.16274089935760172, "grad_norm": 0.2919043059115593, "learning_rate": 2.695035460992908e-05, "loss": 0.5541, "step": 76 }, { "epoch": 0.16488222698072805, "grad_norm": 0.2805459610716478, "learning_rate": 2.7304964539007094e-05, "loss": 0.5328, "step": 77 }, { "epoch": 0.1670235546038544, "grad_norm": 0.2704831415044607, "learning_rate": 2.765957446808511e-05, "loss": 0.5506, "step": 78 }, { "epoch": 0.16916488222698073, "grad_norm": 0.26497156122554943, "learning_rate": 2.8014184397163124e-05, "loss": 0.5383, "step": 79 }, { "epoch": 0.17130620985010706, "grad_norm": 0.26473030453104546, "learning_rate": 2.836879432624114e-05, "loss": 0.5278, "step": 80 }, { "epoch": 0.1734475374732334, "grad_norm": 0.26833166887012466, "learning_rate": 2.8723404255319154e-05, "loss": 0.543, "step": 81 }, { "epoch": 0.17558886509635974, "grad_norm": 0.2669805636364855, "learning_rate": 2.9078014184397162e-05, "loss": 0.5282, "step": 82 }, { "epoch": 0.1777301927194861, "grad_norm": 0.3149827479787504, "learning_rate": 2.9432624113475177e-05, "loss": 0.538, "step": 83 }, { "epoch": 0.17987152034261242, "grad_norm": 0.27006814672073254, "learning_rate": 2.9787234042553192e-05, "loss": 0.5468, "step": 84 }, { "epoch": 0.18201284796573874, "grad_norm": 0.2526636958624785, "learning_rate": 3.0141843971631207e-05, "loss": 0.515, "step": 85 }, { "epoch": 0.1841541755888651, "grad_norm": 0.3066822441172635, "learning_rate": 3.0496453900709222e-05, "loss": 0.5363, "step": 86 }, { "epoch": 0.18629550321199143, "grad_norm": 0.24592969798097147, "learning_rate": 3.085106382978723e-05, "loss": 0.5172, "step": 87 }, { "epoch": 0.18843683083511778, "grad_norm": 0.26341288237609334, "learning_rate": 3.120567375886525e-05, "loss": 0.5406, "step": 88 }, { "epoch": 0.1905781584582441, "grad_norm": 0.29569701405093984, "learning_rate": 3.156028368794326e-05, "loss": 0.5286, "step": 89 }, { "epoch": 0.19271948608137046, "grad_norm": 0.26720546438232595, "learning_rate": 3.191489361702128e-05, "loss": 0.5191, "step": 90 }, { "epoch": 0.1948608137044968, "grad_norm": 0.2861008299317861, "learning_rate": 3.226950354609929e-05, "loss": 0.5292, "step": 91 }, { "epoch": 0.19700214132762311, "grad_norm": 0.3127809337444568, "learning_rate": 3.262411347517731e-05, "loss": 0.5379, "step": 92 }, { "epoch": 0.19914346895074947, "grad_norm": 0.2584764301110668, "learning_rate": 3.2978723404255317e-05, "loss": 0.5066, "step": 93 }, { "epoch": 0.2012847965738758, "grad_norm": 0.2780596173019734, "learning_rate": 3.3333333333333335e-05, "loss": 0.5286, "step": 94 }, { "epoch": 0.20342612419700215, "grad_norm": 0.3273661550437294, "learning_rate": 3.3687943262411347e-05, "loss": 0.5155, "step": 95 }, { "epoch": 0.20556745182012848, "grad_norm": 0.3036241503506882, "learning_rate": 3.4042553191489365e-05, "loss": 0.5282, "step": 96 }, { "epoch": 0.20770877944325483, "grad_norm": 0.275255132741623, "learning_rate": 3.4397163120567377e-05, "loss": 0.5018, "step": 97 }, { "epoch": 0.20985010706638116, "grad_norm": 0.28093670646107644, "learning_rate": 3.4751773049645395e-05, "loss": 0.4945, "step": 98 }, { "epoch": 0.21199143468950749, "grad_norm": 0.27959022964420993, "learning_rate": 3.5106382978723407e-05, "loss": 0.5293, "step": 99 }, { "epoch": 0.21413276231263384, "grad_norm": 0.2987862837213484, "learning_rate": 3.546099290780142e-05, "loss": 0.5184, "step": 100 }, { "epoch": 0.21627408993576017, "grad_norm": 0.2856449444610249, "learning_rate": 3.5815602836879437e-05, "loss": 0.5226, "step": 101 }, { "epoch": 0.21841541755888652, "grad_norm": 0.2968896630112509, "learning_rate": 3.617021276595745e-05, "loss": 0.5314, "step": 102 }, { "epoch": 0.22055674518201285, "grad_norm": 0.30586641123740416, "learning_rate": 3.6524822695035466e-05, "loss": 0.5195, "step": 103 }, { "epoch": 0.22269807280513917, "grad_norm": 0.28057509983972945, "learning_rate": 3.687943262411347e-05, "loss": 0.5287, "step": 104 }, { "epoch": 0.22483940042826553, "grad_norm": 0.2752404260641325, "learning_rate": 3.723404255319149e-05, "loss": 0.525, "step": 105 }, { "epoch": 0.22698072805139186, "grad_norm": 0.28309242710377003, "learning_rate": 3.75886524822695e-05, "loss": 0.5178, "step": 106 }, { "epoch": 0.2291220556745182, "grad_norm": 0.2986830233079882, "learning_rate": 3.794326241134752e-05, "loss": 0.5338, "step": 107 }, { "epoch": 0.23126338329764454, "grad_norm": 0.27191322829490466, "learning_rate": 3.829787234042553e-05, "loss": 0.5064, "step": 108 }, { "epoch": 0.2334047109207709, "grad_norm": 0.3080545126102214, "learning_rate": 3.865248226950355e-05, "loss": 0.515, "step": 109 }, { "epoch": 0.23554603854389722, "grad_norm": 0.31855156829294484, "learning_rate": 3.900709219858156e-05, "loss": 0.5267, "step": 110 }, { "epoch": 0.23768736616702354, "grad_norm": 0.3118610221875637, "learning_rate": 3.936170212765958e-05, "loss": 0.4993, "step": 111 }, { "epoch": 0.2398286937901499, "grad_norm": 0.3563490136500225, "learning_rate": 3.971631205673759e-05, "loss": 0.494, "step": 112 }, { "epoch": 0.24197002141327623, "grad_norm": 0.33491792784430063, "learning_rate": 4.007092198581561e-05, "loss": 0.5166, "step": 113 }, { "epoch": 0.24411134903640258, "grad_norm": 0.3107234877761307, "learning_rate": 4.0425531914893614e-05, "loss": 0.5016, "step": 114 }, { "epoch": 0.2462526766595289, "grad_norm": 0.3574327616039191, "learning_rate": 4.078014184397163e-05, "loss": 0.4997, "step": 115 }, { "epoch": 0.24839400428265523, "grad_norm": 0.31250608779187283, "learning_rate": 4.1134751773049644e-05, "loss": 0.5012, "step": 116 }, { "epoch": 0.2505353319057816, "grad_norm": 0.31466096005113264, "learning_rate": 4.148936170212766e-05, "loss": 0.4898, "step": 117 }, { "epoch": 0.25267665952890794, "grad_norm": 0.35023750633522854, "learning_rate": 4.1843971631205674e-05, "loss": 0.5151, "step": 118 }, { "epoch": 0.25481798715203424, "grad_norm": 0.3055990330623537, "learning_rate": 4.219858156028369e-05, "loss": 0.4976, "step": 119 }, { "epoch": 0.2569593147751606, "grad_norm": 0.3558935694444545, "learning_rate": 4.2553191489361704e-05, "loss": 0.517, "step": 120 }, { "epoch": 0.25910064239828695, "grad_norm": 0.34915884790434, "learning_rate": 4.2907801418439716e-05, "loss": 0.5044, "step": 121 }, { "epoch": 0.26124197002141325, "grad_norm": 0.348632342188165, "learning_rate": 4.3262411347517734e-05, "loss": 0.505, "step": 122 }, { "epoch": 0.2633832976445396, "grad_norm": 0.3785527848062378, "learning_rate": 4.3617021276595746e-05, "loss": 0.5085, "step": 123 }, { "epoch": 0.26552462526766596, "grad_norm": 0.37162644733897265, "learning_rate": 4.3971631205673764e-05, "loss": 0.4833, "step": 124 }, { "epoch": 0.2676659528907923, "grad_norm": 0.3980723109877404, "learning_rate": 4.432624113475177e-05, "loss": 0.4955, "step": 125 }, { "epoch": 0.2698072805139186, "grad_norm": 0.3725438131171889, "learning_rate": 4.468085106382979e-05, "loss": 0.4879, "step": 126 }, { "epoch": 0.27194860813704497, "grad_norm": 0.3546366551258052, "learning_rate": 4.50354609929078e-05, "loss": 0.485, "step": 127 }, { "epoch": 0.2740899357601713, "grad_norm": 0.3352604060765713, "learning_rate": 4.539007092198582e-05, "loss": 0.5125, "step": 128 }, { "epoch": 0.2762312633832976, "grad_norm": 0.38902168152696476, "learning_rate": 4.574468085106383e-05, "loss": 0.4923, "step": 129 }, { "epoch": 0.278372591006424, "grad_norm": 0.4722633165183755, "learning_rate": 4.609929078014185e-05, "loss": 0.4933, "step": 130 }, { "epoch": 0.28051391862955033, "grad_norm": 0.40974642024145574, "learning_rate": 4.645390070921986e-05, "loss": 0.4823, "step": 131 }, { "epoch": 0.2826552462526767, "grad_norm": 0.34213507876160526, "learning_rate": 4.680851063829788e-05, "loss": 0.4865, "step": 132 }, { "epoch": 0.284796573875803, "grad_norm": 0.3961185410637184, "learning_rate": 4.716312056737589e-05, "loss": 0.4865, "step": 133 }, { "epoch": 0.28693790149892934, "grad_norm": 0.37000080466083957, "learning_rate": 4.751773049645391e-05, "loss": 0.4916, "step": 134 }, { "epoch": 0.2890792291220557, "grad_norm": 0.37414319777051985, "learning_rate": 4.787234042553192e-05, "loss": 0.5044, "step": 135 }, { "epoch": 0.291220556745182, "grad_norm": 0.3164774177894217, "learning_rate": 4.822695035460993e-05, "loss": 0.4811, "step": 136 }, { "epoch": 0.29336188436830835, "grad_norm": 0.47067620082781225, "learning_rate": 4.858156028368794e-05, "loss": 0.4945, "step": 137 }, { "epoch": 0.2955032119914347, "grad_norm": 0.4294475664764798, "learning_rate": 4.893617021276596e-05, "loss": 0.5094, "step": 138 }, { "epoch": 0.29764453961456105, "grad_norm": 0.3040028099316697, "learning_rate": 4.929078014184397e-05, "loss": 0.4942, "step": 139 }, { "epoch": 0.29978586723768735, "grad_norm": 0.37748057667062174, "learning_rate": 4.964539007092199e-05, "loss": 0.4905, "step": 140 }, { "epoch": 0.3019271948608137, "grad_norm": 0.2989419262617704, "learning_rate": 5e-05, "loss": 0.4905, "step": 141 }, { "epoch": 0.30406852248394006, "grad_norm": 0.3871243075644112, "learning_rate": 4.996031746031746e-05, "loss": 0.4852, "step": 142 }, { "epoch": 0.30620985010706636, "grad_norm": 0.29602921563211776, "learning_rate": 4.9920634920634924e-05, "loss": 0.4803, "step": 143 }, { "epoch": 0.3083511777301927, "grad_norm": 0.48126987646952557, "learning_rate": 4.9880952380952385e-05, "loss": 0.4963, "step": 144 }, { "epoch": 0.31049250535331907, "grad_norm": 0.3271921917836504, "learning_rate": 4.9841269841269845e-05, "loss": 0.4739, "step": 145 }, { "epoch": 0.31263383297644537, "grad_norm": 0.30962361825644874, "learning_rate": 4.9801587301587306e-05, "loss": 0.4659, "step": 146 }, { "epoch": 0.3147751605995717, "grad_norm": 0.2862717141303497, "learning_rate": 4.976190476190477e-05, "loss": 0.4805, "step": 147 }, { "epoch": 0.3169164882226981, "grad_norm": 0.29165213355742037, "learning_rate": 4.972222222222223e-05, "loss": 0.4893, "step": 148 }, { "epoch": 0.31905781584582443, "grad_norm": 0.2837190882523078, "learning_rate": 4.968253968253969e-05, "loss": 0.4877, "step": 149 }, { "epoch": 0.32119914346895073, "grad_norm": 0.3162679943468195, "learning_rate": 4.964285714285715e-05, "loss": 0.4949, "step": 150 }, { "epoch": 0.3233404710920771, "grad_norm": 0.3166915931837597, "learning_rate": 4.960317460317461e-05, "loss": 0.4745, "step": 151 }, { "epoch": 0.32548179871520344, "grad_norm": 0.2867036124644211, "learning_rate": 4.956349206349207e-05, "loss": 0.4827, "step": 152 }, { "epoch": 0.32762312633832974, "grad_norm": 0.3127471586658351, "learning_rate": 4.9523809523809525e-05, "loss": 0.4817, "step": 153 }, { "epoch": 0.3297644539614561, "grad_norm": 0.28863042358499497, "learning_rate": 4.9484126984126985e-05, "loss": 0.4944, "step": 154 }, { "epoch": 0.33190578158458245, "grad_norm": 0.30635024454971044, "learning_rate": 4.9444444444444446e-05, "loss": 0.4882, "step": 155 }, { "epoch": 0.3340471092077088, "grad_norm": 0.25406638929643754, "learning_rate": 4.940476190476191e-05, "loss": 0.4583, "step": 156 }, { "epoch": 0.3361884368308351, "grad_norm": 0.4236621872965728, "learning_rate": 4.936507936507937e-05, "loss": 0.5011, "step": 157 }, { "epoch": 0.33832976445396146, "grad_norm": 0.33240414924657075, "learning_rate": 4.932539682539683e-05, "loss": 0.4984, "step": 158 }, { "epoch": 0.3404710920770878, "grad_norm": 0.27992076972800195, "learning_rate": 4.928571428571429e-05, "loss": 0.4882, "step": 159 }, { "epoch": 0.3426124197002141, "grad_norm": 0.3378573050357272, "learning_rate": 4.924603174603175e-05, "loss": 0.4821, "step": 160 }, { "epoch": 0.34475374732334046, "grad_norm": 0.3116481418117616, "learning_rate": 4.9206349206349204e-05, "loss": 0.501, "step": 161 }, { "epoch": 0.3468950749464668, "grad_norm": 0.30756617491799604, "learning_rate": 4.9166666666666665e-05, "loss": 0.499, "step": 162 }, { "epoch": 0.3490364025695932, "grad_norm": 0.3501139975577791, "learning_rate": 4.9126984126984125e-05, "loss": 0.4998, "step": 163 }, { "epoch": 0.3511777301927195, "grad_norm": 0.30095198583101096, "learning_rate": 4.9087301587301586e-05, "loss": 0.4772, "step": 164 }, { "epoch": 0.3533190578158458, "grad_norm": 0.29146076836363266, "learning_rate": 4.904761904761905e-05, "loss": 0.4932, "step": 165 }, { "epoch": 0.3554603854389722, "grad_norm": 0.3245937860525494, "learning_rate": 4.900793650793651e-05, "loss": 0.4717, "step": 166 }, { "epoch": 0.3576017130620985, "grad_norm": 0.27276729093930885, "learning_rate": 4.896825396825397e-05, "loss": 0.485, "step": 167 }, { "epoch": 0.35974304068522484, "grad_norm": 0.36020649417508827, "learning_rate": 4.892857142857143e-05, "loss": 0.4873, "step": 168 }, { "epoch": 0.3618843683083512, "grad_norm": 0.2918070400385983, "learning_rate": 4.888888888888889e-05, "loss": 0.4905, "step": 169 }, { "epoch": 0.3640256959314775, "grad_norm": 0.33570775396857644, "learning_rate": 4.884920634920635e-05, "loss": 0.4852, "step": 170 }, { "epoch": 0.36616702355460384, "grad_norm": 0.3134384968531535, "learning_rate": 4.880952380952381e-05, "loss": 0.477, "step": 171 }, { "epoch": 0.3683083511777302, "grad_norm": 0.31510450944034885, "learning_rate": 4.876984126984127e-05, "loss": 0.4851, "step": 172 }, { "epoch": 0.37044967880085655, "grad_norm": 0.3154584471333162, "learning_rate": 4.873015873015873e-05, "loss": 0.4974, "step": 173 }, { "epoch": 0.37259100642398285, "grad_norm": 0.2909626760317359, "learning_rate": 4.8690476190476194e-05, "loss": 0.4837, "step": 174 }, { "epoch": 0.3747323340471092, "grad_norm": 0.3647746614109793, "learning_rate": 4.8650793650793654e-05, "loss": 0.4766, "step": 175 }, { "epoch": 0.37687366167023556, "grad_norm": 0.2994221837641762, "learning_rate": 4.8611111111111115e-05, "loss": 0.4889, "step": 176 }, { "epoch": 0.37901498929336186, "grad_norm": 0.3352657823786708, "learning_rate": 4.8571428571428576e-05, "loss": 0.5019, "step": 177 }, { "epoch": 0.3811563169164882, "grad_norm": 0.3916874320487166, "learning_rate": 4.853174603174604e-05, "loss": 0.5092, "step": 178 }, { "epoch": 0.38329764453961457, "grad_norm": 0.29941263170579285, "learning_rate": 4.84920634920635e-05, "loss": 0.4803, "step": 179 }, { "epoch": 0.3854389721627409, "grad_norm": 0.32217426505709545, "learning_rate": 4.845238095238095e-05, "loss": 0.4984, "step": 180 }, { "epoch": 0.3875802997858672, "grad_norm": 0.3032060935319803, "learning_rate": 4.841269841269841e-05, "loss": 0.4673, "step": 181 }, { "epoch": 0.3897216274089936, "grad_norm": 0.2726502791688811, "learning_rate": 4.837301587301587e-05, "loss": 0.4664, "step": 182 }, { "epoch": 0.39186295503211993, "grad_norm": 0.2971786378502053, "learning_rate": 4.8333333333333334e-05, "loss": 0.4798, "step": 183 }, { "epoch": 0.39400428265524623, "grad_norm": 0.2865455200314379, "learning_rate": 4.8293650793650794e-05, "loss": 0.4775, "step": 184 }, { "epoch": 0.3961456102783726, "grad_norm": 0.2547381179115949, "learning_rate": 4.8253968253968255e-05, "loss": 0.473, "step": 185 }, { "epoch": 0.39828693790149894, "grad_norm": 0.2970502284932418, "learning_rate": 4.8214285714285716e-05, "loss": 0.4777, "step": 186 }, { "epoch": 0.4004282655246253, "grad_norm": 0.3081755326163523, "learning_rate": 4.817460317460318e-05, "loss": 0.4884, "step": 187 }, { "epoch": 0.4025695931477516, "grad_norm": 0.2988953377132881, "learning_rate": 4.813492063492064e-05, "loss": 0.4721, "step": 188 }, { "epoch": 0.40471092077087795, "grad_norm": 0.32840372938982254, "learning_rate": 4.80952380952381e-05, "loss": 0.4592, "step": 189 }, { "epoch": 0.4068522483940043, "grad_norm": 0.27671818316763175, "learning_rate": 4.805555555555556e-05, "loss": 0.4712, "step": 190 }, { "epoch": 0.4089935760171306, "grad_norm": 0.3246160797192649, "learning_rate": 4.801587301587302e-05, "loss": 0.4752, "step": 191 }, { "epoch": 0.41113490364025695, "grad_norm": 0.3566765772926943, "learning_rate": 4.797619047619048e-05, "loss": 0.4672, "step": 192 }, { "epoch": 0.4132762312633833, "grad_norm": 0.26828922214902995, "learning_rate": 4.793650793650794e-05, "loss": 0.4878, "step": 193 }, { "epoch": 0.41541755888650966, "grad_norm": 0.3550040895109434, "learning_rate": 4.78968253968254e-05, "loss": 0.48, "step": 194 }, { "epoch": 0.41755888650963596, "grad_norm": 0.2838693253149512, "learning_rate": 4.785714285714286e-05, "loss": 0.4611, "step": 195 }, { "epoch": 0.4197002141327623, "grad_norm": 0.39397907047263425, "learning_rate": 4.781746031746032e-05, "loss": 0.4655, "step": 196 }, { "epoch": 0.42184154175588867, "grad_norm": 0.26305224728667986, "learning_rate": 4.7777777777777784e-05, "loss": 0.4841, "step": 197 }, { "epoch": 0.42398286937901497, "grad_norm": 0.325701418774634, "learning_rate": 4.7738095238095245e-05, "loss": 0.4603, "step": 198 }, { "epoch": 0.4261241970021413, "grad_norm": 0.442210465125467, "learning_rate": 4.7698412698412706e-05, "loss": 0.4599, "step": 199 }, { "epoch": 0.4282655246252677, "grad_norm": 0.30449934938572704, "learning_rate": 4.7658730158730166e-05, "loss": 0.4715, "step": 200 }, { "epoch": 0.430406852248394, "grad_norm": 0.30205346239976966, "learning_rate": 4.761904761904762e-05, "loss": 0.4832, "step": 201 }, { "epoch": 0.43254817987152033, "grad_norm": 0.29056082365428426, "learning_rate": 4.757936507936508e-05, "loss": 0.489, "step": 202 }, { "epoch": 0.4346895074946467, "grad_norm": 0.36617244189716913, "learning_rate": 4.753968253968254e-05, "loss": 0.4716, "step": 203 }, { "epoch": 0.43683083511777304, "grad_norm": 0.2589569319503199, "learning_rate": 4.75e-05, "loss": 0.4533, "step": 204 }, { "epoch": 0.43897216274089934, "grad_norm": 0.3693939526209848, "learning_rate": 4.746031746031746e-05, "loss": 0.477, "step": 205 }, { "epoch": 0.4411134903640257, "grad_norm": 0.32847584920064604, "learning_rate": 4.7420634920634924e-05, "loss": 0.4928, "step": 206 }, { "epoch": 0.44325481798715205, "grad_norm": 0.44099566221142933, "learning_rate": 4.738095238095238e-05, "loss": 0.4556, "step": 207 }, { "epoch": 0.44539614561027835, "grad_norm": 0.2472596663917096, "learning_rate": 4.734126984126984e-05, "loss": 0.4589, "step": 208 }, { "epoch": 0.4475374732334047, "grad_norm": 0.4006991143128524, "learning_rate": 4.73015873015873e-05, "loss": 0.469, "step": 209 }, { "epoch": 0.44967880085653106, "grad_norm": 0.2851927359912657, "learning_rate": 4.726190476190476e-05, "loss": 0.4766, "step": 210 }, { "epoch": 0.4518201284796574, "grad_norm": 0.3365161124957976, "learning_rate": 4.722222222222222e-05, "loss": 0.4723, "step": 211 }, { "epoch": 0.4539614561027837, "grad_norm": 0.34565226910942354, "learning_rate": 4.718253968253968e-05, "loss": 0.4758, "step": 212 }, { "epoch": 0.45610278372591007, "grad_norm": 0.27344513413865157, "learning_rate": 4.714285714285714e-05, "loss": 0.4679, "step": 213 }, { "epoch": 0.4582441113490364, "grad_norm": 0.38311587773024736, "learning_rate": 4.71031746031746e-05, "loss": 0.48, "step": 214 }, { "epoch": 0.4603854389721627, "grad_norm": 0.34364682832024224, "learning_rate": 4.7063492063492064e-05, "loss": 0.4782, "step": 215 }, { "epoch": 0.4625267665952891, "grad_norm": 0.3914058243306043, "learning_rate": 4.7023809523809525e-05, "loss": 0.4661, "step": 216 }, { "epoch": 0.46466809421841543, "grad_norm": 0.29870287943266227, "learning_rate": 4.6984126984126986e-05, "loss": 0.4638, "step": 217 }, { "epoch": 0.4668094218415418, "grad_norm": 0.4094058032238448, "learning_rate": 4.6944444444444446e-05, "loss": 0.46, "step": 218 }, { "epoch": 0.4689507494646681, "grad_norm": 0.3350045536223048, "learning_rate": 4.690476190476191e-05, "loss": 0.4679, "step": 219 }, { "epoch": 0.47109207708779444, "grad_norm": 0.3512288511765975, "learning_rate": 4.686507936507937e-05, "loss": 0.4685, "step": 220 }, { "epoch": 0.4732334047109208, "grad_norm": 0.3378187173840975, "learning_rate": 4.682539682539683e-05, "loss": 0.4465, "step": 221 }, { "epoch": 0.4753747323340471, "grad_norm": 0.3038250915138398, "learning_rate": 4.678571428571429e-05, "loss": 0.4821, "step": 222 }, { "epoch": 0.47751605995717344, "grad_norm": 0.29703824475925095, "learning_rate": 4.674603174603175e-05, "loss": 0.4747, "step": 223 }, { "epoch": 0.4796573875802998, "grad_norm": 0.3806097625906433, "learning_rate": 4.670634920634921e-05, "loss": 0.4979, "step": 224 }, { "epoch": 0.4817987152034261, "grad_norm": 0.3317321869836366, "learning_rate": 4.666666666666667e-05, "loss": 0.4775, "step": 225 }, { "epoch": 0.48394004282655245, "grad_norm": 0.35267177499930097, "learning_rate": 4.662698412698413e-05, "loss": 0.4707, "step": 226 }, { "epoch": 0.4860813704496788, "grad_norm": 0.3134789773559103, "learning_rate": 4.658730158730159e-05, "loss": 0.4676, "step": 227 }, { "epoch": 0.48822269807280516, "grad_norm": 0.31826459466880935, "learning_rate": 4.6547619047619054e-05, "loss": 0.4741, "step": 228 }, { "epoch": 0.49036402569593146, "grad_norm": 0.3172595965266178, "learning_rate": 4.6507936507936515e-05, "loss": 0.4769, "step": 229 }, { "epoch": 0.4925053533190578, "grad_norm": 0.281104120201888, "learning_rate": 4.646825396825397e-05, "loss": 0.4486, "step": 230 }, { "epoch": 0.49464668094218417, "grad_norm": 0.3390660082734012, "learning_rate": 4.642857142857143e-05, "loss": 0.4721, "step": 231 }, { "epoch": 0.49678800856531047, "grad_norm": 0.2624873773296077, "learning_rate": 4.638888888888889e-05, "loss": 0.4579, "step": 232 }, { "epoch": 0.4989293361884368, "grad_norm": 0.30482318678415316, "learning_rate": 4.634920634920635e-05, "loss": 0.4575, "step": 233 }, { "epoch": 0.5010706638115632, "grad_norm": 0.3163232948976339, "learning_rate": 4.630952380952381e-05, "loss": 0.4771, "step": 234 }, { "epoch": 0.5032119914346895, "grad_norm": 0.28403210542670865, "learning_rate": 4.626984126984127e-05, "loss": 0.4571, "step": 235 }, { "epoch": 0.5053533190578159, "grad_norm": 0.3169956103989183, "learning_rate": 4.623015873015873e-05, "loss": 0.4642, "step": 236 }, { "epoch": 0.5074946466809421, "grad_norm": 0.3124625210824864, "learning_rate": 4.6190476190476194e-05, "loss": 0.4934, "step": 237 }, { "epoch": 0.5096359743040685, "grad_norm": 0.27914854479065176, "learning_rate": 4.6150793650793655e-05, "loss": 0.4777, "step": 238 }, { "epoch": 0.5117773019271948, "grad_norm": 0.30405223359375916, "learning_rate": 4.6111111111111115e-05, "loss": 0.4774, "step": 239 }, { "epoch": 0.5139186295503212, "grad_norm": 0.27824494271256855, "learning_rate": 4.607142857142857e-05, "loss": 0.4778, "step": 240 }, { "epoch": 0.5160599571734475, "grad_norm": 0.28631142513329905, "learning_rate": 4.603174603174603e-05, "loss": 0.4742, "step": 241 }, { "epoch": 0.5182012847965739, "grad_norm": 0.2734339260745776, "learning_rate": 4.599206349206349e-05, "loss": 0.4598, "step": 242 }, { "epoch": 0.5203426124197003, "grad_norm": 0.4662830157775148, "learning_rate": 4.595238095238095e-05, "loss": 0.4622, "step": 243 }, { "epoch": 0.5224839400428265, "grad_norm": 0.27437983872972277, "learning_rate": 4.591269841269841e-05, "loss": 0.4661, "step": 244 }, { "epoch": 0.5246252676659529, "grad_norm": 0.3569225244178123, "learning_rate": 4.587301587301587e-05, "loss": 0.4558, "step": 245 }, { "epoch": 0.5267665952890792, "grad_norm": 0.2921312413477028, "learning_rate": 4.5833333333333334e-05, "loss": 0.473, "step": 246 }, { "epoch": 0.5289079229122056, "grad_norm": 0.3105118836185397, "learning_rate": 4.5793650793650795e-05, "loss": 0.4592, "step": 247 }, { "epoch": 0.5310492505353319, "grad_norm": 0.2602252376592866, "learning_rate": 4.5753968253968255e-05, "loss": 0.4475, "step": 248 }, { "epoch": 0.5331905781584583, "grad_norm": 0.31546458571249575, "learning_rate": 4.5714285714285716e-05, "loss": 0.4655, "step": 249 }, { "epoch": 0.5353319057815846, "grad_norm": 0.29234521833457866, "learning_rate": 4.567460317460318e-05, "loss": 0.4386, "step": 250 }, { "epoch": 0.5374732334047109, "grad_norm": 0.27089799714474905, "learning_rate": 4.563492063492064e-05, "loss": 0.4552, "step": 251 }, { "epoch": 0.5396145610278372, "grad_norm": 0.2876790516942422, "learning_rate": 4.55952380952381e-05, "loss": 0.4643, "step": 252 }, { "epoch": 0.5417558886509636, "grad_norm": 0.2796690158138871, "learning_rate": 4.555555555555556e-05, "loss": 0.4685, "step": 253 }, { "epoch": 0.5438972162740899, "grad_norm": 0.2620205830889585, "learning_rate": 4.551587301587302e-05, "loss": 0.4628, "step": 254 }, { "epoch": 0.5460385438972163, "grad_norm": 0.27675126452812215, "learning_rate": 4.547619047619048e-05, "loss": 0.4659, "step": 255 }, { "epoch": 0.5481798715203426, "grad_norm": 0.2529727200895915, "learning_rate": 4.543650793650794e-05, "loss": 0.4719, "step": 256 }, { "epoch": 0.550321199143469, "grad_norm": 0.28938734392771215, "learning_rate": 4.5396825396825395e-05, "loss": 0.4499, "step": 257 }, { "epoch": 0.5524625267665952, "grad_norm": 0.2732820003277474, "learning_rate": 4.5357142857142856e-05, "loss": 0.4677, "step": 258 }, { "epoch": 0.5546038543897216, "grad_norm": 0.24867294850473456, "learning_rate": 4.531746031746032e-05, "loss": 0.4714, "step": 259 }, { "epoch": 0.556745182012848, "grad_norm": 0.3040098909557791, "learning_rate": 4.527777777777778e-05, "loss": 0.4757, "step": 260 }, { "epoch": 0.5588865096359743, "grad_norm": 0.2705701549546398, "learning_rate": 4.523809523809524e-05, "loss": 0.4642, "step": 261 }, { "epoch": 0.5610278372591007, "grad_norm": 0.2949114446979747, "learning_rate": 4.51984126984127e-05, "loss": 0.4671, "step": 262 }, { "epoch": 0.563169164882227, "grad_norm": 0.28678659699639003, "learning_rate": 4.515873015873016e-05, "loss": 0.4519, "step": 263 }, { "epoch": 0.5653104925053534, "grad_norm": 0.2582358880028455, "learning_rate": 4.511904761904762e-05, "loss": 0.4545, "step": 264 }, { "epoch": 0.5674518201284796, "grad_norm": 0.304233138313808, "learning_rate": 4.507936507936508e-05, "loss": 0.4616, "step": 265 }, { "epoch": 0.569593147751606, "grad_norm": 0.2721420459823916, "learning_rate": 4.503968253968254e-05, "loss": 0.4547, "step": 266 }, { "epoch": 0.5717344753747323, "grad_norm": 0.2687992677978425, "learning_rate": 4.5e-05, "loss": 0.4576, "step": 267 }, { "epoch": 0.5738758029978587, "grad_norm": 0.2742900950884715, "learning_rate": 4.4960317460317464e-05, "loss": 0.4459, "step": 268 }, { "epoch": 0.576017130620985, "grad_norm": 0.28795217687814245, "learning_rate": 4.4920634920634924e-05, "loss": 0.4698, "step": 269 }, { "epoch": 0.5781584582441114, "grad_norm": 0.25641275034601446, "learning_rate": 4.4880952380952385e-05, "loss": 0.4679, "step": 270 }, { "epoch": 0.5802997858672377, "grad_norm": 0.26668890447386795, "learning_rate": 4.4841269841269846e-05, "loss": 0.463, "step": 271 }, { "epoch": 0.582441113490364, "grad_norm": 0.24030260367205755, "learning_rate": 4.4801587301587307e-05, "loss": 0.4628, "step": 272 }, { "epoch": 0.5845824411134903, "grad_norm": 0.2790517674146759, "learning_rate": 4.476190476190477e-05, "loss": 0.4671, "step": 273 }, { "epoch": 0.5867237687366167, "grad_norm": 0.22820456884707455, "learning_rate": 4.472222222222223e-05, "loss": 0.4457, "step": 274 }, { "epoch": 0.588865096359743, "grad_norm": 0.3271552248804396, "learning_rate": 4.468253968253969e-05, "loss": 0.473, "step": 275 }, { "epoch": 0.5910064239828694, "grad_norm": 0.2862040648114735, "learning_rate": 4.464285714285715e-05, "loss": 0.4675, "step": 276 }, { "epoch": 0.5931477516059958, "grad_norm": 0.3114706333448285, "learning_rate": 4.460317460317461e-05, "loss": 0.4666, "step": 277 }, { "epoch": 0.5952890792291221, "grad_norm": 0.32309306607336397, "learning_rate": 4.456349206349207e-05, "loss": 0.463, "step": 278 }, { "epoch": 0.5974304068522484, "grad_norm": 0.24906075204446051, "learning_rate": 4.4523809523809525e-05, "loss": 0.4588, "step": 279 }, { "epoch": 0.5995717344753747, "grad_norm": 0.3269798068381555, "learning_rate": 4.4484126984126986e-05, "loss": 0.4434, "step": 280 }, { "epoch": 0.6017130620985011, "grad_norm": 0.24388996621753112, "learning_rate": 4.4444444444444447e-05, "loss": 0.4773, "step": 281 }, { "epoch": 0.6038543897216274, "grad_norm": 0.3069850683178239, "learning_rate": 4.440476190476191e-05, "loss": 0.4517, "step": 282 }, { "epoch": 0.6059957173447538, "grad_norm": 0.249901094708518, "learning_rate": 4.436507936507937e-05, "loss": 0.4426, "step": 283 }, { "epoch": 0.6081370449678801, "grad_norm": 0.2678810481940219, "learning_rate": 4.432539682539683e-05, "loss": 0.4446, "step": 284 }, { "epoch": 0.6102783725910065, "grad_norm": 0.26626008865151485, "learning_rate": 4.428571428571428e-05, "loss": 0.444, "step": 285 }, { "epoch": 0.6124197002141327, "grad_norm": 0.25802913659501, "learning_rate": 4.4246031746031744e-05, "loss": 0.4514, "step": 286 }, { "epoch": 0.6145610278372591, "grad_norm": 0.3058519427291137, "learning_rate": 4.4206349206349204e-05, "loss": 0.4657, "step": 287 }, { "epoch": 0.6167023554603854, "grad_norm": 0.25739722317495545, "learning_rate": 4.4166666666666665e-05, "loss": 0.4565, "step": 288 }, { "epoch": 0.6188436830835118, "grad_norm": 0.2846833831079424, "learning_rate": 4.4126984126984126e-05, "loss": 0.4584, "step": 289 }, { "epoch": 0.6209850107066381, "grad_norm": 0.258608736893351, "learning_rate": 4.4087301587301587e-05, "loss": 0.4335, "step": 290 }, { "epoch": 0.6231263383297645, "grad_norm": 0.2911602326047305, "learning_rate": 4.404761904761905e-05, "loss": 0.4394, "step": 291 }, { "epoch": 0.6252676659528907, "grad_norm": 0.26424730063896607, "learning_rate": 4.400793650793651e-05, "loss": 0.4739, "step": 292 }, { "epoch": 0.6274089935760171, "grad_norm": 0.27052936749041473, "learning_rate": 4.396825396825397e-05, "loss": 0.4737, "step": 293 }, { "epoch": 0.6295503211991434, "grad_norm": 0.2826287053464395, "learning_rate": 4.392857142857143e-05, "loss": 0.4557, "step": 294 }, { "epoch": 0.6316916488222698, "grad_norm": 0.22012742826317977, "learning_rate": 4.388888888888889e-05, "loss": 0.4408, "step": 295 }, { "epoch": 0.6338329764453962, "grad_norm": 0.28162001988825747, "learning_rate": 4.384920634920635e-05, "loss": 0.4574, "step": 296 }, { "epoch": 0.6359743040685225, "grad_norm": 0.23283539296950778, "learning_rate": 4.380952380952381e-05, "loss": 0.4462, "step": 297 }, { "epoch": 0.6381156316916489, "grad_norm": 0.2498862871583954, "learning_rate": 4.376984126984127e-05, "loss": 0.439, "step": 298 }, { "epoch": 0.6402569593147751, "grad_norm": 0.30936218743365784, "learning_rate": 4.373015873015873e-05, "loss": 0.4405, "step": 299 }, { "epoch": 0.6423982869379015, "grad_norm": 0.22890543657069876, "learning_rate": 4.3690476190476194e-05, "loss": 0.4337, "step": 300 }, { "epoch": 0.6445396145610278, "grad_norm": 0.3478614587683788, "learning_rate": 4.3650793650793655e-05, "loss": 0.4596, "step": 301 }, { "epoch": 0.6466809421841542, "grad_norm": 0.2696278385429742, "learning_rate": 4.3611111111111116e-05, "loss": 0.4713, "step": 302 }, { "epoch": 0.6488222698072805, "grad_norm": 0.3139962716399602, "learning_rate": 4.3571428571428576e-05, "loss": 0.4527, "step": 303 }, { "epoch": 0.6509635974304069, "grad_norm": 0.2670285875512765, "learning_rate": 4.353174603174604e-05, "loss": 0.4627, "step": 304 }, { "epoch": 0.6531049250535332, "grad_norm": 0.3044721685500663, "learning_rate": 4.34920634920635e-05, "loss": 0.4785, "step": 305 }, { "epoch": 0.6552462526766595, "grad_norm": 0.27396062258454984, "learning_rate": 4.345238095238096e-05, "loss": 0.4523, "step": 306 }, { "epoch": 0.6573875802997858, "grad_norm": 0.2886960728161468, "learning_rate": 4.341269841269842e-05, "loss": 0.4517, "step": 307 }, { "epoch": 0.6595289079229122, "grad_norm": 0.2533722996661738, "learning_rate": 4.337301587301587e-05, "loss": 0.4646, "step": 308 }, { "epoch": 0.6616702355460385, "grad_norm": 0.2699468559182033, "learning_rate": 4.3333333333333334e-05, "loss": 0.4421, "step": 309 }, { "epoch": 0.6638115631691649, "grad_norm": 0.283041429141369, "learning_rate": 4.3293650793650795e-05, "loss": 0.4552, "step": 310 }, { "epoch": 0.6659528907922913, "grad_norm": 0.2591706001178728, "learning_rate": 4.3253968253968256e-05, "loss": 0.4527, "step": 311 }, { "epoch": 0.6680942184154176, "grad_norm": 0.2864467963151813, "learning_rate": 4.3214285714285716e-05, "loss": 0.4666, "step": 312 }, { "epoch": 0.6702355460385439, "grad_norm": 0.281657408994935, "learning_rate": 4.317460317460318e-05, "loss": 0.4582, "step": 313 }, { "epoch": 0.6723768736616702, "grad_norm": 0.2474591816739494, "learning_rate": 4.313492063492064e-05, "loss": 0.4361, "step": 314 }, { "epoch": 0.6745182012847966, "grad_norm": 0.2973486888734723, "learning_rate": 4.30952380952381e-05, "loss": 0.4583, "step": 315 }, { "epoch": 0.6766595289079229, "grad_norm": 0.2444964189560587, "learning_rate": 4.305555555555556e-05, "loss": 0.4664, "step": 316 }, { "epoch": 0.6788008565310493, "grad_norm": 0.2480078169039145, "learning_rate": 4.301587301587302e-05, "loss": 0.4675, "step": 317 }, { "epoch": 0.6809421841541756, "grad_norm": 0.26312534234905105, "learning_rate": 4.297619047619048e-05, "loss": 0.4489, "step": 318 }, { "epoch": 0.683083511777302, "grad_norm": 0.2462656315297276, "learning_rate": 4.2936507936507935e-05, "loss": 0.4591, "step": 319 }, { "epoch": 0.6852248394004282, "grad_norm": 0.29816624349859283, "learning_rate": 4.2896825396825396e-05, "loss": 0.4477, "step": 320 }, { "epoch": 0.6873661670235546, "grad_norm": 0.24698029570776403, "learning_rate": 4.2857142857142856e-05, "loss": 0.4695, "step": 321 }, { "epoch": 0.6895074946466809, "grad_norm": 0.34703431482300096, "learning_rate": 4.281746031746032e-05, "loss": 0.4566, "step": 322 }, { "epoch": 0.6916488222698073, "grad_norm": 0.2437786811285962, "learning_rate": 4.277777777777778e-05, "loss": 0.4723, "step": 323 }, { "epoch": 0.6937901498929336, "grad_norm": 0.3412185273042048, "learning_rate": 4.273809523809524e-05, "loss": 0.4532, "step": 324 }, { "epoch": 0.69593147751606, "grad_norm": 0.24904956038384604, "learning_rate": 4.26984126984127e-05, "loss": 0.4334, "step": 325 }, { "epoch": 0.6980728051391863, "grad_norm": 0.2782237575432937, "learning_rate": 4.265873015873016e-05, "loss": 0.4463, "step": 326 }, { "epoch": 0.7002141327623126, "grad_norm": 0.2657363496164857, "learning_rate": 4.261904761904762e-05, "loss": 0.4591, "step": 327 }, { "epoch": 0.702355460385439, "grad_norm": 0.2555708376181943, "learning_rate": 4.257936507936508e-05, "loss": 0.4464, "step": 328 }, { "epoch": 0.7044967880085653, "grad_norm": 0.23571687740169944, "learning_rate": 4.253968253968254e-05, "loss": 0.4455, "step": 329 }, { "epoch": 0.7066381156316917, "grad_norm": 0.2687735724291379, "learning_rate": 4.25e-05, "loss": 0.4442, "step": 330 }, { "epoch": 0.708779443254818, "grad_norm": 0.21677485542153913, "learning_rate": 4.2460317460317464e-05, "loss": 0.4389, "step": 331 }, { "epoch": 0.7109207708779444, "grad_norm": 0.30463445202519995, "learning_rate": 4.2420634920634925e-05, "loss": 0.4641, "step": 332 }, { "epoch": 0.7130620985010707, "grad_norm": 0.25037659567634474, "learning_rate": 4.2380952380952385e-05, "loss": 0.4601, "step": 333 }, { "epoch": 0.715203426124197, "grad_norm": 0.2597863908544412, "learning_rate": 4.2341269841269846e-05, "loss": 0.4692, "step": 334 }, { "epoch": 0.7173447537473233, "grad_norm": 0.26953376114396266, "learning_rate": 4.23015873015873e-05, "loss": 0.454, "step": 335 }, { "epoch": 0.7194860813704497, "grad_norm": 0.246208461301376, "learning_rate": 4.226190476190476e-05, "loss": 0.463, "step": 336 }, { "epoch": 0.721627408993576, "grad_norm": 0.2370326879028165, "learning_rate": 4.222222222222222e-05, "loss": 0.4467, "step": 337 }, { "epoch": 0.7237687366167024, "grad_norm": 0.25036860829476676, "learning_rate": 4.218253968253968e-05, "loss": 0.4411, "step": 338 }, { "epoch": 0.7259100642398287, "grad_norm": 0.28329537291272733, "learning_rate": 4.214285714285714e-05, "loss": 0.454, "step": 339 }, { "epoch": 0.728051391862955, "grad_norm": 0.24559865290348679, "learning_rate": 4.2103174603174604e-05, "loss": 0.4498, "step": 340 }, { "epoch": 0.7301927194860813, "grad_norm": 0.23500115967335444, "learning_rate": 4.2063492063492065e-05, "loss": 0.4552, "step": 341 }, { "epoch": 0.7323340471092077, "grad_norm": 0.2919643243289569, "learning_rate": 4.2023809523809525e-05, "loss": 0.4567, "step": 342 }, { "epoch": 0.734475374732334, "grad_norm": 0.26791366452779686, "learning_rate": 4.1984126984126986e-05, "loss": 0.4442, "step": 343 }, { "epoch": 0.7366167023554604, "grad_norm": 0.28291332973939526, "learning_rate": 4.194444444444445e-05, "loss": 0.451, "step": 344 }, { "epoch": 0.7387580299785867, "grad_norm": 0.25359000867556963, "learning_rate": 4.190476190476191e-05, "loss": 0.4425, "step": 345 }, { "epoch": 0.7408993576017131, "grad_norm": 0.2408660681176407, "learning_rate": 4.186507936507937e-05, "loss": 0.4493, "step": 346 }, { "epoch": 0.7430406852248393, "grad_norm": 0.3349449875093207, "learning_rate": 4.182539682539683e-05, "loss": 0.4592, "step": 347 }, { "epoch": 0.7451820128479657, "grad_norm": 0.2559707605515902, "learning_rate": 4.178571428571429e-05, "loss": 0.4538, "step": 348 }, { "epoch": 0.7473233404710921, "grad_norm": 0.2737394625735486, "learning_rate": 4.174603174603175e-05, "loss": 0.4563, "step": 349 }, { "epoch": 0.7494646680942184, "grad_norm": 0.28364900087424905, "learning_rate": 4.170634920634921e-05, "loss": 0.4741, "step": 350 }, { "epoch": 0.7516059957173448, "grad_norm": 0.2741462100795298, "learning_rate": 4.166666666666667e-05, "loss": 0.4363, "step": 351 }, { "epoch": 0.7537473233404711, "grad_norm": 0.25635492049633196, "learning_rate": 4.162698412698413e-05, "loss": 0.4445, "step": 352 }, { "epoch": 0.7558886509635975, "grad_norm": 0.273452663060448, "learning_rate": 4.1587301587301594e-05, "loss": 0.4408, "step": 353 }, { "epoch": 0.7580299785867237, "grad_norm": 0.32668360058869866, "learning_rate": 4.1547619047619054e-05, "loss": 0.4653, "step": 354 }, { "epoch": 0.7601713062098501, "grad_norm": 0.28322113864222886, "learning_rate": 4.1507936507936515e-05, "loss": 0.449, "step": 355 }, { "epoch": 0.7623126338329764, "grad_norm": 0.3314451874615497, "learning_rate": 4.1468253968253976e-05, "loss": 0.4345, "step": 356 }, { "epoch": 0.7644539614561028, "grad_norm": 0.2909325990893949, "learning_rate": 4.1428571428571437e-05, "loss": 0.4501, "step": 357 }, { "epoch": 0.7665952890792291, "grad_norm": 0.3155371913936611, "learning_rate": 4.138888888888889e-05, "loss": 0.4362, "step": 358 }, { "epoch": 0.7687366167023555, "grad_norm": 0.24516857979495404, "learning_rate": 4.134920634920635e-05, "loss": 0.4457, "step": 359 }, { "epoch": 0.7708779443254818, "grad_norm": 0.30672444383523295, "learning_rate": 4.130952380952381e-05, "loss": 0.4382, "step": 360 }, { "epoch": 0.7730192719486081, "grad_norm": 0.2622046826324259, "learning_rate": 4.126984126984127e-05, "loss": 0.4427, "step": 361 }, { "epoch": 0.7751605995717344, "grad_norm": 0.28744995963189524, "learning_rate": 4.123015873015873e-05, "loss": 0.4608, "step": 362 }, { "epoch": 0.7773019271948608, "grad_norm": 0.2566986215900858, "learning_rate": 4.119047619047619e-05, "loss": 0.4395, "step": 363 }, { "epoch": 0.7794432548179872, "grad_norm": 0.2754243217704112, "learning_rate": 4.115079365079365e-05, "loss": 0.4472, "step": 364 }, { "epoch": 0.7815845824411135, "grad_norm": 0.2679510555818145, "learning_rate": 4.111111111111111e-05, "loss": 0.4364, "step": 365 }, { "epoch": 0.7837259100642399, "grad_norm": 0.22802893505394484, "learning_rate": 4.107142857142857e-05, "loss": 0.4341, "step": 366 }, { "epoch": 0.7858672376873662, "grad_norm": 0.25400510440151647, "learning_rate": 4.103174603174603e-05, "loss": 0.446, "step": 367 }, { "epoch": 0.7880085653104925, "grad_norm": 0.24898639805458245, "learning_rate": 4.099206349206349e-05, "loss": 0.4417, "step": 368 }, { "epoch": 0.7901498929336188, "grad_norm": 0.2820704635610971, "learning_rate": 4.095238095238095e-05, "loss": 0.4518, "step": 369 }, { "epoch": 0.7922912205567452, "grad_norm": 0.23351424195098205, "learning_rate": 4.091269841269841e-05, "loss": 0.4515, "step": 370 }, { "epoch": 0.7944325481798715, "grad_norm": 0.2815505680256724, "learning_rate": 4.0873015873015874e-05, "loss": 0.4296, "step": 371 }, { "epoch": 0.7965738758029979, "grad_norm": 0.23872824455145594, "learning_rate": 4.0833333333333334e-05, "loss": 0.4253, "step": 372 }, { "epoch": 0.7987152034261242, "grad_norm": 0.2509283478607524, "learning_rate": 4.0793650793650795e-05, "loss": 0.4272, "step": 373 }, { "epoch": 0.8008565310492506, "grad_norm": 0.23260890926948258, "learning_rate": 4.0753968253968256e-05, "loss": 0.4422, "step": 374 }, { "epoch": 0.8029978586723768, "grad_norm": 0.24640220531762067, "learning_rate": 4.0714285714285717e-05, "loss": 0.4325, "step": 375 }, { "epoch": 0.8051391862955032, "grad_norm": 0.2557935473418916, "learning_rate": 4.067460317460318e-05, "loss": 0.4495, "step": 376 }, { "epoch": 0.8072805139186295, "grad_norm": 0.24864370999195623, "learning_rate": 4.063492063492064e-05, "loss": 0.44, "step": 377 }, { "epoch": 0.8094218415417559, "grad_norm": 0.2494337325169666, "learning_rate": 4.05952380952381e-05, "loss": 0.4292, "step": 378 }, { "epoch": 0.8115631691648822, "grad_norm": 0.26643265488032447, "learning_rate": 4.055555555555556e-05, "loss": 0.4258, "step": 379 }, { "epoch": 0.8137044967880086, "grad_norm": 0.29981525233093126, "learning_rate": 4.051587301587302e-05, "loss": 0.4462, "step": 380 }, { "epoch": 0.815845824411135, "grad_norm": 0.23483623618223876, "learning_rate": 4.047619047619048e-05, "loss": 0.4403, "step": 381 }, { "epoch": 0.8179871520342612, "grad_norm": 0.31621068938716507, "learning_rate": 4.043650793650794e-05, "loss": 0.4542, "step": 382 }, { "epoch": 0.8201284796573876, "grad_norm": 0.27447908054014813, "learning_rate": 4.03968253968254e-05, "loss": 0.4498, "step": 383 }, { "epoch": 0.8222698072805139, "grad_norm": 0.2979087905677751, "learning_rate": 4.035714285714286e-05, "loss": 0.4326, "step": 384 }, { "epoch": 0.8244111349036403, "grad_norm": 0.2738788951804694, "learning_rate": 4.031746031746032e-05, "loss": 0.4433, "step": 385 }, { "epoch": 0.8265524625267666, "grad_norm": 0.27184798198740845, "learning_rate": 4.027777777777778e-05, "loss": 0.4385, "step": 386 }, { "epoch": 0.828693790149893, "grad_norm": 0.2596242318887042, "learning_rate": 4.023809523809524e-05, "loss": 0.4391, "step": 387 }, { "epoch": 0.8308351177730193, "grad_norm": 0.3257005719389779, "learning_rate": 4.01984126984127e-05, "loss": 0.4515, "step": 388 }, { "epoch": 0.8329764453961456, "grad_norm": 0.26020892798323664, "learning_rate": 4.015873015873016e-05, "loss": 0.4525, "step": 389 }, { "epoch": 0.8351177730192719, "grad_norm": 0.3282937862319744, "learning_rate": 4.011904761904762e-05, "loss": 0.4522, "step": 390 }, { "epoch": 0.8372591006423983, "grad_norm": 0.3036258464302225, "learning_rate": 4.007936507936508e-05, "loss": 0.4477, "step": 391 }, { "epoch": 0.8394004282655246, "grad_norm": 0.26338851157250837, "learning_rate": 4.003968253968254e-05, "loss": 0.4422, "step": 392 }, { "epoch": 0.841541755888651, "grad_norm": 0.3106182811071532, "learning_rate": 4e-05, "loss": 0.4309, "step": 393 }, { "epoch": 0.8436830835117773, "grad_norm": 0.24266766603453416, "learning_rate": 3.9960317460317464e-05, "loss": 0.4466, "step": 394 }, { "epoch": 0.8458244111349036, "grad_norm": 0.34811764903452264, "learning_rate": 3.9920634920634925e-05, "loss": 0.452, "step": 395 }, { "epoch": 0.8479657387580299, "grad_norm": 0.24788695830598698, "learning_rate": 3.9880952380952386e-05, "loss": 0.4433, "step": 396 }, { "epoch": 0.8501070663811563, "grad_norm": 0.2783115131710977, "learning_rate": 3.984126984126984e-05, "loss": 0.4561, "step": 397 }, { "epoch": 0.8522483940042827, "grad_norm": 0.2576730623053907, "learning_rate": 3.98015873015873e-05, "loss": 0.4708, "step": 398 }, { "epoch": 0.854389721627409, "grad_norm": 0.24712208382727868, "learning_rate": 3.976190476190476e-05, "loss": 0.449, "step": 399 }, { "epoch": 0.8565310492505354, "grad_norm": 0.2708918582747635, "learning_rate": 3.972222222222222e-05, "loss": 0.446, "step": 400 }, { "epoch": 0.8586723768736617, "grad_norm": 0.2713784850892593, "learning_rate": 3.968253968253968e-05, "loss": 0.4386, "step": 401 }, { "epoch": 0.860813704496788, "grad_norm": 0.25255493559196973, "learning_rate": 3.964285714285714e-05, "loss": 0.44, "step": 402 }, { "epoch": 0.8629550321199143, "grad_norm": 0.27193870527806324, "learning_rate": 3.9603174603174604e-05, "loss": 0.4485, "step": 403 }, { "epoch": 0.8650963597430407, "grad_norm": 0.27126270170714106, "learning_rate": 3.9563492063492065e-05, "loss": 0.4525, "step": 404 }, { "epoch": 0.867237687366167, "grad_norm": 0.24844125447179155, "learning_rate": 3.9523809523809526e-05, "loss": 0.4336, "step": 405 }, { "epoch": 0.8693790149892934, "grad_norm": 0.30939011775053954, "learning_rate": 3.9484126984126986e-05, "loss": 0.4494, "step": 406 }, { "epoch": 0.8715203426124197, "grad_norm": 0.26378548519547557, "learning_rate": 3.944444444444445e-05, "loss": 0.447, "step": 407 }, { "epoch": 0.8736616702355461, "grad_norm": 0.2849481666705005, "learning_rate": 3.940476190476191e-05, "loss": 0.4583, "step": 408 }, { "epoch": 0.8758029978586723, "grad_norm": 0.2815143992545685, "learning_rate": 3.936507936507937e-05, "loss": 0.4348, "step": 409 }, { "epoch": 0.8779443254817987, "grad_norm": 0.27153299884477455, "learning_rate": 3.932539682539683e-05, "loss": 0.4434, "step": 410 }, { "epoch": 0.880085653104925, "grad_norm": 0.27535841889086504, "learning_rate": 3.928571428571429e-05, "loss": 0.4398, "step": 411 }, { "epoch": 0.8822269807280514, "grad_norm": 0.30776377057560106, "learning_rate": 3.9246031746031744e-05, "loss": 0.4464, "step": 412 }, { "epoch": 0.8843683083511777, "grad_norm": 0.24429372628536725, "learning_rate": 3.9206349206349205e-05, "loss": 0.4345, "step": 413 }, { "epoch": 0.8865096359743041, "grad_norm": 0.34967840303013203, "learning_rate": 3.9166666666666665e-05, "loss": 0.447, "step": 414 }, { "epoch": 0.8886509635974305, "grad_norm": 0.25224213794765044, "learning_rate": 3.9126984126984126e-05, "loss": 0.4477, "step": 415 }, { "epoch": 0.8907922912205567, "grad_norm": 0.2899965867095618, "learning_rate": 3.908730158730159e-05, "loss": 0.4183, "step": 416 }, { "epoch": 0.892933618843683, "grad_norm": 0.3091076754528513, "learning_rate": 3.904761904761905e-05, "loss": 0.4438, "step": 417 }, { "epoch": 0.8950749464668094, "grad_norm": 0.25144689022877925, "learning_rate": 3.900793650793651e-05, "loss": 0.4467, "step": 418 }, { "epoch": 0.8972162740899358, "grad_norm": 0.3164686564857124, "learning_rate": 3.896825396825397e-05, "loss": 0.4348, "step": 419 }, { "epoch": 0.8993576017130621, "grad_norm": 0.2879131478620461, "learning_rate": 3.892857142857143e-05, "loss": 0.4546, "step": 420 }, { "epoch": 0.9014989293361885, "grad_norm": 0.2718535367036997, "learning_rate": 3.888888888888889e-05, "loss": 0.4493, "step": 421 }, { "epoch": 0.9036402569593148, "grad_norm": 0.3215237762816025, "learning_rate": 3.884920634920635e-05, "loss": 0.4324, "step": 422 }, { "epoch": 0.9057815845824411, "grad_norm": 0.23703610924543467, "learning_rate": 3.880952380952381e-05, "loss": 0.4423, "step": 423 }, { "epoch": 0.9079229122055674, "grad_norm": 0.2790375906958242, "learning_rate": 3.876984126984127e-05, "loss": 0.459, "step": 424 }, { "epoch": 0.9100642398286938, "grad_norm": 0.31216707197799737, "learning_rate": 3.8730158730158734e-05, "loss": 0.4358, "step": 425 }, { "epoch": 0.9122055674518201, "grad_norm": 0.2238572953572114, "learning_rate": 3.8690476190476195e-05, "loss": 0.4329, "step": 426 }, { "epoch": 0.9143468950749465, "grad_norm": 0.3300050366021809, "learning_rate": 3.8650793650793655e-05, "loss": 0.4411, "step": 427 }, { "epoch": 0.9164882226980728, "grad_norm": 0.24904767848985657, "learning_rate": 3.8611111111111116e-05, "loss": 0.4395, "step": 428 }, { "epoch": 0.9186295503211992, "grad_norm": 0.2558995321875151, "learning_rate": 3.857142857142858e-05, "loss": 0.4325, "step": 429 }, { "epoch": 0.9207708779443254, "grad_norm": 0.2810339034644166, "learning_rate": 3.853174603174604e-05, "loss": 0.4333, "step": 430 }, { "epoch": 0.9229122055674518, "grad_norm": 0.25199173622671855, "learning_rate": 3.84920634920635e-05, "loss": 0.4516, "step": 431 }, { "epoch": 0.9250535331905781, "grad_norm": 0.27563768688654877, "learning_rate": 3.845238095238096e-05, "loss": 0.4306, "step": 432 }, { "epoch": 0.9271948608137045, "grad_norm": 0.24180893154053762, "learning_rate": 3.841269841269842e-05, "loss": 0.4327, "step": 433 }, { "epoch": 0.9293361884368309, "grad_norm": 0.232187215372479, "learning_rate": 3.837301587301588e-05, "loss": 0.4383, "step": 434 }, { "epoch": 0.9314775160599572, "grad_norm": 0.2524306893162537, "learning_rate": 3.8333333333333334e-05, "loss": 0.4383, "step": 435 }, { "epoch": 0.9336188436830836, "grad_norm": 0.21917495109126314, "learning_rate": 3.8293650793650795e-05, "loss": 0.442, "step": 436 }, { "epoch": 0.9357601713062098, "grad_norm": 0.2491947970666118, "learning_rate": 3.8253968253968256e-05, "loss": 0.4284, "step": 437 }, { "epoch": 0.9379014989293362, "grad_norm": 0.24928999737243862, "learning_rate": 3.821428571428572e-05, "loss": 0.4366, "step": 438 }, { "epoch": 0.9400428265524625, "grad_norm": 0.2696841956340179, "learning_rate": 3.817460317460317e-05, "loss": 0.4426, "step": 439 }, { "epoch": 0.9421841541755889, "grad_norm": 0.22630244921640252, "learning_rate": 3.813492063492063e-05, "loss": 0.4363, "step": 440 }, { "epoch": 0.9443254817987152, "grad_norm": 0.2600201845565814, "learning_rate": 3.809523809523809e-05, "loss": 0.4484, "step": 441 }, { "epoch": 0.9464668094218416, "grad_norm": 0.24580135509156412, "learning_rate": 3.805555555555555e-05, "loss": 0.4671, "step": 442 }, { "epoch": 0.9486081370449678, "grad_norm": 0.29113072327829725, "learning_rate": 3.8015873015873014e-05, "loss": 0.451, "step": 443 }, { "epoch": 0.9507494646680942, "grad_norm": 0.24308118122648267, "learning_rate": 3.7976190476190474e-05, "loss": 0.4564, "step": 444 }, { "epoch": 0.9528907922912205, "grad_norm": 0.3071448001103864, "learning_rate": 3.7936507936507935e-05, "loss": 0.4397, "step": 445 }, { "epoch": 0.9550321199143469, "grad_norm": 0.2420827295559477, "learning_rate": 3.7896825396825396e-05, "loss": 0.4454, "step": 446 }, { "epoch": 0.9571734475374732, "grad_norm": 0.23925961181164737, "learning_rate": 3.785714285714286e-05, "loss": 0.4422, "step": 447 }, { "epoch": 0.9593147751605996, "grad_norm": 0.27857277637461475, "learning_rate": 3.781746031746032e-05, "loss": 0.4451, "step": 448 }, { "epoch": 0.961456102783726, "grad_norm": 0.25387578048055814, "learning_rate": 3.777777777777778e-05, "loss": 0.454, "step": 449 }, { "epoch": 0.9635974304068522, "grad_norm": 0.2365009172752972, "learning_rate": 3.773809523809524e-05, "loss": 0.4477, "step": 450 }, { "epoch": 0.9657387580299786, "grad_norm": 0.24886378867096692, "learning_rate": 3.76984126984127e-05, "loss": 0.4455, "step": 451 }, { "epoch": 0.9678800856531049, "grad_norm": 0.22709536457385576, "learning_rate": 3.765873015873016e-05, "loss": 0.4421, "step": 452 }, { "epoch": 0.9700214132762313, "grad_norm": 0.24437838490375288, "learning_rate": 3.761904761904762e-05, "loss": 0.4407, "step": 453 }, { "epoch": 0.9721627408993576, "grad_norm": 0.25704107343861016, "learning_rate": 3.757936507936508e-05, "loss": 0.4688, "step": 454 }, { "epoch": 0.974304068522484, "grad_norm": 0.22904949606864244, "learning_rate": 3.753968253968254e-05, "loss": 0.4293, "step": 455 }, { "epoch": 0.9764453961456103, "grad_norm": 0.2849795459258746, "learning_rate": 3.7500000000000003e-05, "loss": 0.4462, "step": 456 }, { "epoch": 0.9785867237687366, "grad_norm": 0.25874699145604835, "learning_rate": 3.7460317460317464e-05, "loss": 0.4533, "step": 457 }, { "epoch": 0.9807280513918629, "grad_norm": 0.3152490582808027, "learning_rate": 3.7420634920634925e-05, "loss": 0.4407, "step": 458 }, { "epoch": 0.9828693790149893, "grad_norm": 0.26375789665291616, "learning_rate": 3.7380952380952386e-05, "loss": 0.4465, "step": 459 }, { "epoch": 0.9850107066381156, "grad_norm": 0.29544790745591465, "learning_rate": 3.7341269841269846e-05, "loss": 0.4488, "step": 460 }, { "epoch": 0.987152034261242, "grad_norm": 0.2825194356752145, "learning_rate": 3.730158730158731e-05, "loss": 0.4254, "step": 461 }, { "epoch": 0.9892933618843683, "grad_norm": 0.24723341454362188, "learning_rate": 3.726190476190476e-05, "loss": 0.4313, "step": 462 }, { "epoch": 0.9914346895074947, "grad_norm": 0.23944247120858028, "learning_rate": 3.722222222222222e-05, "loss": 0.4297, "step": 463 }, { "epoch": 0.9935760171306209, "grad_norm": 0.25318330122355875, "learning_rate": 3.718253968253968e-05, "loss": 0.4248, "step": 464 }, { "epoch": 0.9957173447537473, "grad_norm": 0.2385083732481541, "learning_rate": 3.7142857142857143e-05, "loss": 0.4342, "step": 465 }, { "epoch": 0.9978586723768736, "grad_norm": 0.25267257737774884, "learning_rate": 3.7103174603174604e-05, "loss": 0.4244, "step": 466 }, { "epoch": 1.0, "grad_norm": 0.26302764538521656, "learning_rate": 3.7063492063492065e-05, "loss": 0.4197, "step": 467 }, { "epoch": 1.0021413276231264, "grad_norm": 0.3321143759595324, "learning_rate": 3.7023809523809526e-05, "loss": 0.3746, "step": 468 }, { "epoch": 1.0042826552462527, "grad_norm": 0.2523859067388859, "learning_rate": 3.6984126984126986e-05, "loss": 0.3629, "step": 469 }, { "epoch": 1.006423982869379, "grad_norm": 0.2740491528188909, "learning_rate": 3.694444444444445e-05, "loss": 0.3739, "step": 470 }, { "epoch": 1.0085653104925054, "grad_norm": 0.34376513245262397, "learning_rate": 3.690476190476191e-05, "loss": 0.366, "step": 471 }, { "epoch": 1.0107066381156318, "grad_norm": 0.28059137656399624, "learning_rate": 3.686507936507937e-05, "loss": 0.3727, "step": 472 }, { "epoch": 1.0128479657387581, "grad_norm": 0.33729133415206547, "learning_rate": 3.682539682539683e-05, "loss": 0.3657, "step": 473 }, { "epoch": 1.0149892933618843, "grad_norm": 0.3201683000077876, "learning_rate": 3.678571428571429e-05, "loss": 0.3802, "step": 474 }, { "epoch": 1.0171306209850106, "grad_norm": 0.274999594299655, "learning_rate": 3.674603174603175e-05, "loss": 0.3772, "step": 475 }, { "epoch": 1.019271948608137, "grad_norm": 0.31109609046109504, "learning_rate": 3.6706349206349205e-05, "loss": 0.3795, "step": 476 }, { "epoch": 1.0214132762312633, "grad_norm": 0.2575665550459576, "learning_rate": 3.6666666666666666e-05, "loss": 0.376, "step": 477 }, { "epoch": 1.0235546038543897, "grad_norm": 0.2937108429839069, "learning_rate": 3.6626984126984126e-05, "loss": 0.3638, "step": 478 }, { "epoch": 1.025695931477516, "grad_norm": 0.2511369961460089, "learning_rate": 3.658730158730159e-05, "loss": 0.3725, "step": 479 }, { "epoch": 1.0278372591006424, "grad_norm": 0.2839057922395332, "learning_rate": 3.654761904761905e-05, "loss": 0.3627, "step": 480 }, { "epoch": 1.0299785867237687, "grad_norm": 0.2920184281736407, "learning_rate": 3.650793650793651e-05, "loss": 0.3808, "step": 481 }, { "epoch": 1.032119914346895, "grad_norm": 0.233470618770099, "learning_rate": 3.646825396825397e-05, "loss": 0.3617, "step": 482 }, { "epoch": 1.0342612419700214, "grad_norm": 0.280577964771194, "learning_rate": 3.642857142857143e-05, "loss": 0.3625, "step": 483 }, { "epoch": 1.0364025695931478, "grad_norm": 0.24608809205741292, "learning_rate": 3.638888888888889e-05, "loss": 0.3646, "step": 484 }, { "epoch": 1.0385438972162742, "grad_norm": 0.2200793066660099, "learning_rate": 3.634920634920635e-05, "loss": 0.3593, "step": 485 }, { "epoch": 1.0406852248394005, "grad_norm": 0.3327179418533381, "learning_rate": 3.630952380952381e-05, "loss": 0.3933, "step": 486 }, { "epoch": 1.0428265524625269, "grad_norm": 0.22445951219430535, "learning_rate": 3.626984126984127e-05, "loss": 0.3591, "step": 487 }, { "epoch": 1.044967880085653, "grad_norm": 0.2496038218726081, "learning_rate": 3.6230158730158734e-05, "loss": 0.3858, "step": 488 }, { "epoch": 1.0471092077087794, "grad_norm": 0.31223282629773935, "learning_rate": 3.619047619047619e-05, "loss": 0.3833, "step": 489 }, { "epoch": 1.0492505353319057, "grad_norm": 0.2500697247329698, "learning_rate": 3.615079365079365e-05, "loss": 0.3675, "step": 490 }, { "epoch": 1.051391862955032, "grad_norm": 0.2569196171542971, "learning_rate": 3.611111111111111e-05, "loss": 0.3682, "step": 491 }, { "epoch": 1.0535331905781584, "grad_norm": 0.2862772036703233, "learning_rate": 3.607142857142857e-05, "loss": 0.3521, "step": 492 }, { "epoch": 1.0556745182012848, "grad_norm": 0.22001094989053036, "learning_rate": 3.603174603174603e-05, "loss": 0.3737, "step": 493 }, { "epoch": 1.0578158458244111, "grad_norm": 0.25763962840222093, "learning_rate": 3.599206349206349e-05, "loss": 0.3559, "step": 494 }, { "epoch": 1.0599571734475375, "grad_norm": 0.26453793828520467, "learning_rate": 3.595238095238095e-05, "loss": 0.3764, "step": 495 }, { "epoch": 1.0620985010706638, "grad_norm": 0.24859096396107916, "learning_rate": 3.591269841269841e-05, "loss": 0.3646, "step": 496 }, { "epoch": 1.0642398286937902, "grad_norm": 0.24879931824582785, "learning_rate": 3.5873015873015874e-05, "loss": 0.3856, "step": 497 }, { "epoch": 1.0663811563169165, "grad_norm": 0.26070745567359305, "learning_rate": 3.5833333333333335e-05, "loss": 0.3776, "step": 498 }, { "epoch": 1.068522483940043, "grad_norm": 0.25811232439255394, "learning_rate": 3.5793650793650795e-05, "loss": 0.3668, "step": 499 }, { "epoch": 1.0706638115631693, "grad_norm": 0.23394126650129565, "learning_rate": 3.5753968253968256e-05, "loss": 0.3644, "step": 500 }, { "epoch": 1.0728051391862956, "grad_norm": 0.28885516256092003, "learning_rate": 3.571428571428572e-05, "loss": 0.3697, "step": 501 }, { "epoch": 1.0749464668094217, "grad_norm": 0.22243797367983625, "learning_rate": 3.567460317460318e-05, "loss": 0.3624, "step": 502 }, { "epoch": 1.077087794432548, "grad_norm": 0.25222271031887367, "learning_rate": 3.563492063492064e-05, "loss": 0.3719, "step": 503 }, { "epoch": 1.0792291220556745, "grad_norm": 0.26965537434104925, "learning_rate": 3.55952380952381e-05, "loss": 0.3959, "step": 504 }, { "epoch": 1.0813704496788008, "grad_norm": 0.24050297443124782, "learning_rate": 3.555555555555556e-05, "loss": 0.3581, "step": 505 }, { "epoch": 1.0835117773019272, "grad_norm": 0.2687861602822808, "learning_rate": 3.551587301587302e-05, "loss": 0.3758, "step": 506 }, { "epoch": 1.0856531049250535, "grad_norm": 0.2140853649825794, "learning_rate": 3.547619047619048e-05, "loss": 0.3806, "step": 507 }, { "epoch": 1.0877944325481799, "grad_norm": 0.24365107193268598, "learning_rate": 3.543650793650794e-05, "loss": 0.3731, "step": 508 }, { "epoch": 1.0899357601713062, "grad_norm": 0.24553711140882334, "learning_rate": 3.53968253968254e-05, "loss": 0.3478, "step": 509 }, { "epoch": 1.0920770877944326, "grad_norm": 0.2489843525995639, "learning_rate": 3.5357142857142864e-05, "loss": 0.3639, "step": 510 }, { "epoch": 1.094218415417559, "grad_norm": 0.2847640243532346, "learning_rate": 3.5317460317460324e-05, "loss": 0.3652, "step": 511 }, { "epoch": 1.0963597430406853, "grad_norm": 0.24864833640916995, "learning_rate": 3.527777777777778e-05, "loss": 0.3771, "step": 512 }, { "epoch": 1.0985010706638116, "grad_norm": 0.24597951146232283, "learning_rate": 3.523809523809524e-05, "loss": 0.3474, "step": 513 }, { "epoch": 1.100642398286938, "grad_norm": 0.2710332839771354, "learning_rate": 3.51984126984127e-05, "loss": 0.3611, "step": 514 }, { "epoch": 1.1027837259100641, "grad_norm": 0.24106875283916565, "learning_rate": 3.515873015873016e-05, "loss": 0.3694, "step": 515 }, { "epoch": 1.1049250535331905, "grad_norm": 0.25198007958486407, "learning_rate": 3.511904761904762e-05, "loss": 0.3676, "step": 516 }, { "epoch": 1.1070663811563168, "grad_norm": 0.285475860805922, "learning_rate": 3.5079365079365075e-05, "loss": 0.3694, "step": 517 }, { "epoch": 1.1092077087794432, "grad_norm": 0.232641447262773, "learning_rate": 3.5039682539682536e-05, "loss": 0.3543, "step": 518 }, { "epoch": 1.1113490364025695, "grad_norm": 0.26148277895318806, "learning_rate": 3.5e-05, "loss": 0.3737, "step": 519 }, { "epoch": 1.113490364025696, "grad_norm": 0.2522296596506257, "learning_rate": 3.496031746031746e-05, "loss": 0.3685, "step": 520 }, { "epoch": 1.1156316916488223, "grad_norm": 0.24283436286680268, "learning_rate": 3.492063492063492e-05, "loss": 0.3607, "step": 521 }, { "epoch": 1.1177730192719486, "grad_norm": 0.2546190850581306, "learning_rate": 3.488095238095238e-05, "loss": 0.379, "step": 522 }, { "epoch": 1.119914346895075, "grad_norm": 0.2695857257313346, "learning_rate": 3.484126984126984e-05, "loss": 0.3791, "step": 523 }, { "epoch": 1.1220556745182013, "grad_norm": 0.26122674703315873, "learning_rate": 3.48015873015873e-05, "loss": 0.3654, "step": 524 }, { "epoch": 1.1241970021413277, "grad_norm": 0.23999078841945964, "learning_rate": 3.476190476190476e-05, "loss": 0.3876, "step": 525 }, { "epoch": 1.126338329764454, "grad_norm": 0.27711890693427227, "learning_rate": 3.472222222222222e-05, "loss": 0.3678, "step": 526 }, { "epoch": 1.1284796573875804, "grad_norm": 0.22153222863862254, "learning_rate": 3.468253968253968e-05, "loss": 0.3695, "step": 527 }, { "epoch": 1.1306209850107067, "grad_norm": 0.2367741399209089, "learning_rate": 3.4642857142857144e-05, "loss": 0.3619, "step": 528 }, { "epoch": 1.132762312633833, "grad_norm": 0.2279683733856979, "learning_rate": 3.4603174603174604e-05, "loss": 0.3602, "step": 529 }, { "epoch": 1.1349036402569592, "grad_norm": 0.21797256781445612, "learning_rate": 3.4563492063492065e-05, "loss": 0.3538, "step": 530 }, { "epoch": 1.1370449678800856, "grad_norm": 0.23243584252507798, "learning_rate": 3.4523809523809526e-05, "loss": 0.3676, "step": 531 }, { "epoch": 1.139186295503212, "grad_norm": 0.22337304786840087, "learning_rate": 3.448412698412699e-05, "loss": 0.35, "step": 532 }, { "epoch": 1.1413276231263383, "grad_norm": 0.2723870117088924, "learning_rate": 3.444444444444445e-05, "loss": 0.3681, "step": 533 }, { "epoch": 1.1434689507494646, "grad_norm": 0.23683938267697505, "learning_rate": 3.440476190476191e-05, "loss": 0.3584, "step": 534 }, { "epoch": 1.145610278372591, "grad_norm": 0.27263835901881306, "learning_rate": 3.436507936507937e-05, "loss": 0.3585, "step": 535 }, { "epoch": 1.1477516059957173, "grad_norm": 0.2413247019766125, "learning_rate": 3.432539682539683e-05, "loss": 0.3715, "step": 536 }, { "epoch": 1.1498929336188437, "grad_norm": 0.24835573947131598, "learning_rate": 3.428571428571429e-05, "loss": 0.3834, "step": 537 }, { "epoch": 1.15203426124197, "grad_norm": 0.24833529249782688, "learning_rate": 3.424603174603175e-05, "loss": 0.3753, "step": 538 }, { "epoch": 1.1541755888650964, "grad_norm": 0.2271111061767862, "learning_rate": 3.420634920634921e-05, "loss": 0.3751, "step": 539 }, { "epoch": 1.1563169164882228, "grad_norm": 0.2432938466019091, "learning_rate": 3.4166666666666666e-05, "loss": 0.3521, "step": 540 }, { "epoch": 1.1584582441113491, "grad_norm": 0.2458707748193166, "learning_rate": 3.412698412698413e-05, "loss": 0.3583, "step": 541 }, { "epoch": 1.1605995717344753, "grad_norm": 0.23943667615924907, "learning_rate": 3.408730158730159e-05, "loss": 0.3623, "step": 542 }, { "epoch": 1.1627408993576016, "grad_norm": 0.29767722341183855, "learning_rate": 3.404761904761905e-05, "loss": 0.3624, "step": 543 }, { "epoch": 1.164882226980728, "grad_norm": 0.2643286960690453, "learning_rate": 3.400793650793651e-05, "loss": 0.3688, "step": 544 }, { "epoch": 1.1670235546038543, "grad_norm": 0.261706842552651, "learning_rate": 3.396825396825397e-05, "loss": 0.363, "step": 545 }, { "epoch": 1.1691648822269807, "grad_norm": 0.22632083311117027, "learning_rate": 3.392857142857143e-05, "loss": 0.37, "step": 546 }, { "epoch": 1.171306209850107, "grad_norm": 0.24969845223117051, "learning_rate": 3.388888888888889e-05, "loss": 0.371, "step": 547 }, { "epoch": 1.1734475374732334, "grad_norm": 0.2653686082584629, "learning_rate": 3.384920634920635e-05, "loss": 0.3641, "step": 548 }, { "epoch": 1.1755888650963597, "grad_norm": 0.26484419864560405, "learning_rate": 3.380952380952381e-05, "loss": 0.3532, "step": 549 }, { "epoch": 1.177730192719486, "grad_norm": 0.23731845700258258, "learning_rate": 3.3769841269841273e-05, "loss": 0.3844, "step": 550 }, { "epoch": 1.1798715203426124, "grad_norm": 0.26369894447851056, "learning_rate": 3.3730158730158734e-05, "loss": 0.3535, "step": 551 }, { "epoch": 1.1820128479657388, "grad_norm": 0.25523841664126234, "learning_rate": 3.3690476190476195e-05, "loss": 0.3915, "step": 552 }, { "epoch": 1.1841541755888652, "grad_norm": 0.21550220946043203, "learning_rate": 3.3650793650793656e-05, "loss": 0.3623, "step": 553 }, { "epoch": 1.1862955032119915, "grad_norm": 0.24110638888192076, "learning_rate": 3.3611111111111116e-05, "loss": 0.3499, "step": 554 }, { "epoch": 1.1884368308351179, "grad_norm": 0.22464037830444605, "learning_rate": 3.357142857142857e-05, "loss": 0.3578, "step": 555 }, { "epoch": 1.1905781584582442, "grad_norm": 0.21226847946734964, "learning_rate": 3.353174603174603e-05, "loss": 0.3634, "step": 556 }, { "epoch": 1.1927194860813706, "grad_norm": 0.2626157214230889, "learning_rate": 3.349206349206349e-05, "loss": 0.3589, "step": 557 }, { "epoch": 1.1948608137044967, "grad_norm": 0.24071856058212684, "learning_rate": 3.345238095238095e-05, "loss": 0.3863, "step": 558 }, { "epoch": 1.197002141327623, "grad_norm": 0.2508127197520457, "learning_rate": 3.3412698412698413e-05, "loss": 0.386, "step": 559 }, { "epoch": 1.1991434689507494, "grad_norm": 0.30389459018728837, "learning_rate": 3.3373015873015874e-05, "loss": 0.3833, "step": 560 }, { "epoch": 1.2012847965738758, "grad_norm": 0.25210041774328223, "learning_rate": 3.3333333333333335e-05, "loss": 0.3517, "step": 561 }, { "epoch": 1.2034261241970021, "grad_norm": 0.23500359192291567, "learning_rate": 3.3293650793650796e-05, "loss": 0.3605, "step": 562 }, { "epoch": 1.2055674518201285, "grad_norm": 0.2546074299402884, "learning_rate": 3.3253968253968256e-05, "loss": 0.3516, "step": 563 }, { "epoch": 1.2077087794432548, "grad_norm": 0.2709353855529332, "learning_rate": 3.321428571428572e-05, "loss": 0.3701, "step": 564 }, { "epoch": 1.2098501070663812, "grad_norm": 0.2518216621854237, "learning_rate": 3.317460317460318e-05, "loss": 0.3652, "step": 565 }, { "epoch": 1.2119914346895075, "grad_norm": 0.23203813858204622, "learning_rate": 3.313492063492064e-05, "loss": 0.357, "step": 566 }, { "epoch": 1.214132762312634, "grad_norm": 0.27322569967364363, "learning_rate": 3.309523809523809e-05, "loss": 0.3795, "step": 567 }, { "epoch": 1.2162740899357602, "grad_norm": 0.21862325914700695, "learning_rate": 3.3055555555555553e-05, "loss": 0.3803, "step": 568 }, { "epoch": 1.2184154175588866, "grad_norm": 0.23817063938434127, "learning_rate": 3.3015873015873014e-05, "loss": 0.3864, "step": 569 }, { "epoch": 1.2205567451820127, "grad_norm": 0.30402766988609314, "learning_rate": 3.2976190476190475e-05, "loss": 0.3784, "step": 570 }, { "epoch": 1.222698072805139, "grad_norm": 0.23766830428808083, "learning_rate": 3.2936507936507936e-05, "loss": 0.3588, "step": 571 }, { "epoch": 1.2248394004282654, "grad_norm": 0.25852053977967815, "learning_rate": 3.2896825396825396e-05, "loss": 0.3781, "step": 572 }, { "epoch": 1.2269807280513918, "grad_norm": 0.292675984396594, "learning_rate": 3.285714285714286e-05, "loss": 0.3791, "step": 573 }, { "epoch": 1.2291220556745182, "grad_norm": 0.22578595387659117, "learning_rate": 3.281746031746032e-05, "loss": 0.368, "step": 574 }, { "epoch": 1.2312633832976445, "grad_norm": 0.24728132039412187, "learning_rate": 3.277777777777778e-05, "loss": 0.3696, "step": 575 }, { "epoch": 1.2334047109207709, "grad_norm": 0.27218412013650006, "learning_rate": 3.273809523809524e-05, "loss": 0.3688, "step": 576 }, { "epoch": 1.2355460385438972, "grad_norm": 0.23158465192272315, "learning_rate": 3.26984126984127e-05, "loss": 0.3842, "step": 577 }, { "epoch": 1.2376873661670236, "grad_norm": 0.27482920742913025, "learning_rate": 3.265873015873016e-05, "loss": 0.3663, "step": 578 }, { "epoch": 1.23982869379015, "grad_norm": 0.25920426583780776, "learning_rate": 3.261904761904762e-05, "loss": 0.3571, "step": 579 }, { "epoch": 1.2419700214132763, "grad_norm": 0.2032778358032685, "learning_rate": 3.257936507936508e-05, "loss": 0.3488, "step": 580 }, { "epoch": 1.2441113490364026, "grad_norm": 0.2549778314308203, "learning_rate": 3.253968253968254e-05, "loss": 0.37, "step": 581 }, { "epoch": 1.246252676659529, "grad_norm": 0.25860176335450585, "learning_rate": 3.2500000000000004e-05, "loss": 0.3633, "step": 582 }, { "epoch": 1.2483940042826553, "grad_norm": 0.21568263900821674, "learning_rate": 3.2460317460317465e-05, "loss": 0.3594, "step": 583 }, { "epoch": 1.2505353319057817, "grad_norm": 0.25190898114681015, "learning_rate": 3.2420634920634925e-05, "loss": 0.3543, "step": 584 }, { "epoch": 1.252676659528908, "grad_norm": 0.21237788073269723, "learning_rate": 3.2380952380952386e-05, "loss": 0.3755, "step": 585 }, { "epoch": 1.2548179871520342, "grad_norm": 0.22878119129492097, "learning_rate": 3.234126984126985e-05, "loss": 0.3598, "step": 586 }, { "epoch": 1.2569593147751605, "grad_norm": 0.24998274367059609, "learning_rate": 3.230158730158731e-05, "loss": 0.3671, "step": 587 }, { "epoch": 1.259100642398287, "grad_norm": 0.24248609005355534, "learning_rate": 3.226190476190477e-05, "loss": 0.365, "step": 588 }, { "epoch": 1.2612419700214133, "grad_norm": 0.2502819042148945, "learning_rate": 3.222222222222223e-05, "loss": 0.3654, "step": 589 }, { "epoch": 1.2633832976445396, "grad_norm": 0.24428764588220775, "learning_rate": 3.218253968253968e-05, "loss": 0.375, "step": 590 }, { "epoch": 1.265524625267666, "grad_norm": 0.24064115102187758, "learning_rate": 3.2142857142857144e-05, "loss": 0.3646, "step": 591 }, { "epoch": 1.2676659528907923, "grad_norm": 0.2580980916190992, "learning_rate": 3.2103174603174605e-05, "loss": 0.3805, "step": 592 }, { "epoch": 1.2698072805139187, "grad_norm": 0.21389149883559067, "learning_rate": 3.2063492063492065e-05, "loss": 0.3854, "step": 593 }, { "epoch": 1.271948608137045, "grad_norm": 0.23647472042052045, "learning_rate": 3.202380952380952e-05, "loss": 0.3645, "step": 594 }, { "epoch": 1.2740899357601714, "grad_norm": 0.23724295037763593, "learning_rate": 3.198412698412698e-05, "loss": 0.3735, "step": 595 }, { "epoch": 1.2762312633832975, "grad_norm": 0.22413428930081114, "learning_rate": 3.194444444444444e-05, "loss": 0.359, "step": 596 }, { "epoch": 1.2783725910064239, "grad_norm": 0.2223223372100241, "learning_rate": 3.19047619047619e-05, "loss": 0.3491, "step": 597 }, { "epoch": 1.2805139186295502, "grad_norm": 0.21897487856488682, "learning_rate": 3.186507936507936e-05, "loss": 0.357, "step": 598 }, { "epoch": 1.2826552462526766, "grad_norm": 0.2502304419567601, "learning_rate": 3.182539682539682e-05, "loss": 0.3536, "step": 599 }, { "epoch": 1.284796573875803, "grad_norm": 0.2253482204551222, "learning_rate": 3.1785714285714284e-05, "loss": 0.3558, "step": 600 }, { "epoch": 1.2869379014989293, "grad_norm": 0.24613177754827212, "learning_rate": 3.1746031746031745e-05, "loss": 0.3515, "step": 601 }, { "epoch": 1.2890792291220556, "grad_norm": 0.2523575221500364, "learning_rate": 3.1706349206349205e-05, "loss": 0.3741, "step": 602 }, { "epoch": 1.291220556745182, "grad_norm": 0.2165503319453962, "learning_rate": 3.1666666666666666e-05, "loss": 0.3727, "step": 603 }, { "epoch": 1.2933618843683083, "grad_norm": 0.21823243034291184, "learning_rate": 3.162698412698413e-05, "loss": 0.3533, "step": 604 }, { "epoch": 1.2955032119914347, "grad_norm": 0.24997589624196248, "learning_rate": 3.158730158730159e-05, "loss": 0.3542, "step": 605 }, { "epoch": 1.297644539614561, "grad_norm": 0.2398097391159386, "learning_rate": 3.154761904761905e-05, "loss": 0.3697, "step": 606 }, { "epoch": 1.2997858672376874, "grad_norm": 0.20653894840882975, "learning_rate": 3.150793650793651e-05, "loss": 0.3541, "step": 607 }, { "epoch": 1.3019271948608138, "grad_norm": 0.24435984092571286, "learning_rate": 3.146825396825397e-05, "loss": 0.3596, "step": 608 }, { "epoch": 1.3040685224839401, "grad_norm": 0.2542985397543424, "learning_rate": 3.142857142857143e-05, "loss": 0.3583, "step": 609 }, { "epoch": 1.3062098501070665, "grad_norm": 0.20683543774079505, "learning_rate": 3.138888888888889e-05, "loss": 0.3483, "step": 610 }, { "epoch": 1.3083511777301928, "grad_norm": 0.2711230309464256, "learning_rate": 3.134920634920635e-05, "loss": 0.3616, "step": 611 }, { "epoch": 1.3104925053533192, "grad_norm": 0.24151563936505477, "learning_rate": 3.130952380952381e-05, "loss": 0.3498, "step": 612 }, { "epoch": 1.3126338329764453, "grad_norm": 0.2369073543305672, "learning_rate": 3.1269841269841274e-05, "loss": 0.366, "step": 613 }, { "epoch": 1.3147751605995717, "grad_norm": 0.24227771996000944, "learning_rate": 3.1230158730158734e-05, "loss": 0.3599, "step": 614 }, { "epoch": 1.316916488222698, "grad_norm": 0.2221305104163018, "learning_rate": 3.1190476190476195e-05, "loss": 0.3456, "step": 615 }, { "epoch": 1.3190578158458244, "grad_norm": 0.2745562820217464, "learning_rate": 3.1150793650793656e-05, "loss": 0.373, "step": 616 }, { "epoch": 1.3211991434689507, "grad_norm": 0.223755905789405, "learning_rate": 3.111111111111111e-05, "loss": 0.3743, "step": 617 }, { "epoch": 1.323340471092077, "grad_norm": 0.2540980055756857, "learning_rate": 3.107142857142857e-05, "loss": 0.3752, "step": 618 }, { "epoch": 1.3254817987152034, "grad_norm": 0.31394476060654153, "learning_rate": 3.103174603174603e-05, "loss": 0.3918, "step": 619 }, { "epoch": 1.3276231263383298, "grad_norm": 0.2326066740490904, "learning_rate": 3.099206349206349e-05, "loss": 0.3697, "step": 620 }, { "epoch": 1.3297644539614561, "grad_norm": 0.24392074483576617, "learning_rate": 3.095238095238095e-05, "loss": 0.3522, "step": 621 }, { "epoch": 1.3319057815845825, "grad_norm": 0.25390065497286196, "learning_rate": 3.0912698412698414e-05, "loss": 0.3757, "step": 622 }, { "epoch": 1.3340471092077089, "grad_norm": 0.20396843300419068, "learning_rate": 3.0873015873015874e-05, "loss": 0.3651, "step": 623 }, { "epoch": 1.336188436830835, "grad_norm": 0.22881137081512465, "learning_rate": 3.0833333333333335e-05, "loss": 0.3638, "step": 624 }, { "epoch": 1.3383297644539613, "grad_norm": 0.23141190116614863, "learning_rate": 3.0793650793650796e-05, "loss": 0.3617, "step": 625 }, { "epoch": 1.3404710920770877, "grad_norm": 0.21197621308607237, "learning_rate": 3.075396825396826e-05, "loss": 0.3533, "step": 626 }, { "epoch": 1.342612419700214, "grad_norm": 0.22886167158750198, "learning_rate": 3.071428571428572e-05, "loss": 0.3798, "step": 627 }, { "epoch": 1.3447537473233404, "grad_norm": 0.21750304837930745, "learning_rate": 3.067460317460318e-05, "loss": 0.3781, "step": 628 }, { "epoch": 1.3468950749464668, "grad_norm": 0.21938329417465666, "learning_rate": 3.063492063492064e-05, "loss": 0.3586, "step": 629 }, { "epoch": 1.3490364025695931, "grad_norm": 0.21402476475138535, "learning_rate": 3.05952380952381e-05, "loss": 0.3716, "step": 630 }, { "epoch": 1.3511777301927195, "grad_norm": 0.20938404630600568, "learning_rate": 3.055555555555556e-05, "loss": 0.37, "step": 631 }, { "epoch": 1.3533190578158458, "grad_norm": 0.21383889850254142, "learning_rate": 3.051587301587302e-05, "loss": 0.3792, "step": 632 }, { "epoch": 1.3554603854389722, "grad_norm": 0.22055010594265703, "learning_rate": 3.0476190476190482e-05, "loss": 0.3589, "step": 633 }, { "epoch": 1.3576017130620985, "grad_norm": 0.19487907168624696, "learning_rate": 3.0436507936507936e-05, "loss": 0.3535, "step": 634 }, { "epoch": 1.359743040685225, "grad_norm": 0.22306820784789425, "learning_rate": 3.0396825396825397e-05, "loss": 0.3572, "step": 635 }, { "epoch": 1.3618843683083512, "grad_norm": 0.2229559913435905, "learning_rate": 3.0357142857142857e-05, "loss": 0.3707, "step": 636 }, { "epoch": 1.3640256959314776, "grad_norm": 0.2024293507877544, "learning_rate": 3.0317460317460318e-05, "loss": 0.3638, "step": 637 }, { "epoch": 1.366167023554604, "grad_norm": 0.23534992817477, "learning_rate": 3.0277777777777776e-05, "loss": 0.365, "step": 638 }, { "epoch": 1.3683083511777303, "grad_norm": 0.22873997219956177, "learning_rate": 3.0238095238095236e-05, "loss": 0.3338, "step": 639 }, { "epoch": 1.3704496788008567, "grad_norm": 0.2211599162133884, "learning_rate": 3.0198412698412697e-05, "loss": 0.3784, "step": 640 }, { "epoch": 1.3725910064239828, "grad_norm": 0.22100903081850962, "learning_rate": 3.0158730158730158e-05, "loss": 0.3677, "step": 641 }, { "epoch": 1.3747323340471092, "grad_norm": 0.20600842295123745, "learning_rate": 3.011904761904762e-05, "loss": 0.361, "step": 642 }, { "epoch": 1.3768736616702355, "grad_norm": 0.22980010113385077, "learning_rate": 3.007936507936508e-05, "loss": 0.3465, "step": 643 }, { "epoch": 1.3790149892933619, "grad_norm": 0.21464631919266589, "learning_rate": 3.003968253968254e-05, "loss": 0.3681, "step": 644 }, { "epoch": 1.3811563169164882, "grad_norm": 0.2103030135241598, "learning_rate": 3e-05, "loss": 0.3603, "step": 645 }, { "epoch": 1.3832976445396146, "grad_norm": 0.23432239498652505, "learning_rate": 2.996031746031746e-05, "loss": 0.3676, "step": 646 }, { "epoch": 1.385438972162741, "grad_norm": 0.21783149705335722, "learning_rate": 2.9920634920634922e-05, "loss": 0.3588, "step": 647 }, { "epoch": 1.3875802997858673, "grad_norm": 0.274165710942849, "learning_rate": 2.9880952380952383e-05, "loss": 0.361, "step": 648 }, { "epoch": 1.3897216274089936, "grad_norm": 0.21073344703205954, "learning_rate": 2.9841269841269844e-05, "loss": 0.3497, "step": 649 }, { "epoch": 1.39186295503212, "grad_norm": 0.2501430083399487, "learning_rate": 2.98015873015873e-05, "loss": 0.3634, "step": 650 }, { "epoch": 1.3940042826552461, "grad_norm": 0.21899163049205936, "learning_rate": 2.9761904761904762e-05, "loss": 0.3679, "step": 651 }, { "epoch": 1.3961456102783725, "grad_norm": 0.294173480692159, "learning_rate": 2.9722222222222223e-05, "loss": 0.3826, "step": 652 }, { "epoch": 1.3982869379014988, "grad_norm": 0.2482472851407277, "learning_rate": 2.9682539682539683e-05, "loss": 0.3713, "step": 653 }, { "epoch": 1.4004282655246252, "grad_norm": 0.24006701265887465, "learning_rate": 2.9642857142857144e-05, "loss": 0.366, "step": 654 }, { "epoch": 1.4025695931477515, "grad_norm": 0.2591909670041966, "learning_rate": 2.9603174603174605e-05, "loss": 0.3623, "step": 655 }, { "epoch": 1.404710920770878, "grad_norm": 0.22731690936226825, "learning_rate": 2.9563492063492066e-05, "loss": 0.3691, "step": 656 }, { "epoch": 1.4068522483940042, "grad_norm": 0.2536340951226291, "learning_rate": 2.9523809523809526e-05, "loss": 0.3774, "step": 657 }, { "epoch": 1.4089935760171306, "grad_norm": 0.22286664711972803, "learning_rate": 2.9484126984126987e-05, "loss": 0.3743, "step": 658 }, { "epoch": 1.411134903640257, "grad_norm": 0.23183154595936484, "learning_rate": 2.9444444444444448e-05, "loss": 0.3598, "step": 659 }, { "epoch": 1.4132762312633833, "grad_norm": 0.25169102444114877, "learning_rate": 2.940476190476191e-05, "loss": 0.3589, "step": 660 }, { "epoch": 1.4154175588865097, "grad_norm": 0.21476384262074055, "learning_rate": 2.9365079365079366e-05, "loss": 0.3652, "step": 661 }, { "epoch": 1.417558886509636, "grad_norm": 0.25197060268479204, "learning_rate": 2.9325396825396827e-05, "loss": 0.3666, "step": 662 }, { "epoch": 1.4197002141327624, "grad_norm": 0.2673997115188798, "learning_rate": 2.9285714285714288e-05, "loss": 0.3671, "step": 663 }, { "epoch": 1.4218415417558887, "grad_norm": 0.23612940999561857, "learning_rate": 2.9246031746031748e-05, "loss": 0.3492, "step": 664 }, { "epoch": 1.423982869379015, "grad_norm": 0.2601602289574086, "learning_rate": 2.920634920634921e-05, "loss": 0.3581, "step": 665 }, { "epoch": 1.4261241970021414, "grad_norm": 0.2276622282515761, "learning_rate": 2.916666666666667e-05, "loss": 0.3625, "step": 666 }, { "epoch": 1.4282655246252678, "grad_norm": 0.2231777877927664, "learning_rate": 2.912698412698413e-05, "loss": 0.357, "step": 667 }, { "epoch": 1.430406852248394, "grad_norm": 0.21491218629534398, "learning_rate": 2.908730158730159e-05, "loss": 0.3659, "step": 668 }, { "epoch": 1.4325481798715203, "grad_norm": 0.2286321200334171, "learning_rate": 2.9047619047619052e-05, "loss": 0.3616, "step": 669 }, { "epoch": 1.4346895074946466, "grad_norm": 0.2431901099756692, "learning_rate": 2.9007936507936513e-05, "loss": 0.3574, "step": 670 }, { "epoch": 1.436830835117773, "grad_norm": 0.23060779470960058, "learning_rate": 2.8968253968253974e-05, "loss": 0.3584, "step": 671 }, { "epoch": 1.4389721627408993, "grad_norm": 0.21613457164084013, "learning_rate": 2.8928571428571434e-05, "loss": 0.3566, "step": 672 }, { "epoch": 1.4411134903640257, "grad_norm": 0.264503775266096, "learning_rate": 2.8888888888888888e-05, "loss": 0.3828, "step": 673 }, { "epoch": 1.443254817987152, "grad_norm": 0.23265546959620192, "learning_rate": 2.884920634920635e-05, "loss": 0.3633, "step": 674 }, { "epoch": 1.4453961456102784, "grad_norm": 0.23025174524649092, "learning_rate": 2.880952380952381e-05, "loss": 0.3676, "step": 675 }, { "epoch": 1.4475374732334048, "grad_norm": 0.21855417995132626, "learning_rate": 2.876984126984127e-05, "loss": 0.3683, "step": 676 }, { "epoch": 1.4496788008565311, "grad_norm": 0.22496528832364351, "learning_rate": 2.8730158730158728e-05, "loss": 0.3779, "step": 677 }, { "epoch": 1.4518201284796575, "grad_norm": 0.2244181509364692, "learning_rate": 2.869047619047619e-05, "loss": 0.3766, "step": 678 }, { "epoch": 1.4539614561027836, "grad_norm": 0.2079076247700585, "learning_rate": 2.865079365079365e-05, "loss": 0.3617, "step": 679 }, { "epoch": 1.45610278372591, "grad_norm": 0.23278584379048903, "learning_rate": 2.861111111111111e-05, "loss": 0.3613, "step": 680 }, { "epoch": 1.4582441113490363, "grad_norm": 0.22944194687094452, "learning_rate": 2.857142857142857e-05, "loss": 0.3514, "step": 681 }, { "epoch": 1.4603854389721627, "grad_norm": 0.24034672179425406, "learning_rate": 2.853174603174603e-05, "loss": 0.3723, "step": 682 }, { "epoch": 1.462526766595289, "grad_norm": 0.235937135297554, "learning_rate": 2.8492063492063492e-05, "loss": 0.3597, "step": 683 }, { "epoch": 1.4646680942184154, "grad_norm": 0.26577068348082256, "learning_rate": 2.8452380952380953e-05, "loss": 0.3839, "step": 684 }, { "epoch": 1.4668094218415417, "grad_norm": 0.22074582010953367, "learning_rate": 2.8412698412698414e-05, "loss": 0.3762, "step": 685 }, { "epoch": 1.468950749464668, "grad_norm": 0.28955823214995086, "learning_rate": 2.8373015873015875e-05, "loss": 0.3742, "step": 686 }, { "epoch": 1.4710920770877944, "grad_norm": 0.23728903924448824, "learning_rate": 2.8333333333333335e-05, "loss": 0.3682, "step": 687 }, { "epoch": 1.4732334047109208, "grad_norm": 0.21949914443063165, "learning_rate": 2.8293650793650793e-05, "loss": 0.3827, "step": 688 }, { "epoch": 1.4753747323340471, "grad_norm": 0.27980639261823936, "learning_rate": 2.8253968253968253e-05, "loss": 0.3525, "step": 689 }, { "epoch": 1.4775160599571735, "grad_norm": 0.2314438069257415, "learning_rate": 2.8214285714285714e-05, "loss": 0.3679, "step": 690 }, { "epoch": 1.4796573875802999, "grad_norm": 0.2722100613441337, "learning_rate": 2.8174603174603175e-05, "loss": 0.3751, "step": 691 }, { "epoch": 1.4817987152034262, "grad_norm": 0.2445277948976506, "learning_rate": 2.8134920634920636e-05, "loss": 0.378, "step": 692 }, { "epoch": 1.4839400428265526, "grad_norm": 0.2658539356677542, "learning_rate": 2.8095238095238096e-05, "loss": 0.3619, "step": 693 }, { "epoch": 1.486081370449679, "grad_norm": 0.26351984231630443, "learning_rate": 2.8055555555555557e-05, "loss": 0.3884, "step": 694 }, { "epoch": 1.4882226980728053, "grad_norm": 0.23831638328525492, "learning_rate": 2.8015873015873018e-05, "loss": 0.359, "step": 695 }, { "epoch": 1.4903640256959314, "grad_norm": 0.21588034310392898, "learning_rate": 2.797619047619048e-05, "loss": 0.3576, "step": 696 }, { "epoch": 1.4925053533190578, "grad_norm": 0.2814689991132766, "learning_rate": 2.793650793650794e-05, "loss": 0.3702, "step": 697 }, { "epoch": 1.4946466809421841, "grad_norm": 0.19677015269212336, "learning_rate": 2.78968253968254e-05, "loss": 0.3481, "step": 698 }, { "epoch": 1.4967880085653105, "grad_norm": 0.24694240495989817, "learning_rate": 2.785714285714286e-05, "loss": 0.3404, "step": 699 }, { "epoch": 1.4989293361884368, "grad_norm": 0.24560310196329904, "learning_rate": 2.781746031746032e-05, "loss": 0.3559, "step": 700 }, { "epoch": 1.5010706638115632, "grad_norm": 0.22770557903012514, "learning_rate": 2.777777777777778e-05, "loss": 0.3774, "step": 701 }, { "epoch": 1.5032119914346895, "grad_norm": 0.22247306696683353, "learning_rate": 2.773809523809524e-05, "loss": 0.3531, "step": 702 }, { "epoch": 1.5053533190578159, "grad_norm": 0.23940565806300032, "learning_rate": 2.76984126984127e-05, "loss": 0.3548, "step": 703 }, { "epoch": 1.507494646680942, "grad_norm": 0.2535843655330012, "learning_rate": 2.765873015873016e-05, "loss": 0.3657, "step": 704 }, { "epoch": 1.5096359743040684, "grad_norm": 0.23618580242395265, "learning_rate": 2.7619047619047622e-05, "loss": 0.3684, "step": 705 }, { "epoch": 1.5117773019271947, "grad_norm": 0.2548030311217275, "learning_rate": 2.7579365079365083e-05, "loss": 0.3575, "step": 706 }, { "epoch": 1.513918629550321, "grad_norm": 0.23669897400390896, "learning_rate": 2.7539682539682544e-05, "loss": 0.3678, "step": 707 }, { "epoch": 1.5160599571734474, "grad_norm": 0.22764730629262053, "learning_rate": 2.7500000000000004e-05, "loss": 0.3738, "step": 708 }, { "epoch": 1.5182012847965738, "grad_norm": 0.2522265156348486, "learning_rate": 2.7460317460317465e-05, "loss": 0.3752, "step": 709 }, { "epoch": 1.5203426124197001, "grad_norm": 0.22848735051444552, "learning_rate": 2.7420634920634926e-05, "loss": 0.3699, "step": 710 }, { "epoch": 1.5224839400428265, "grad_norm": 0.23087075847319702, "learning_rate": 2.7380952380952383e-05, "loss": 0.3539, "step": 711 }, { "epoch": 1.5246252676659529, "grad_norm": 0.22350228895743782, "learning_rate": 2.734126984126984e-05, "loss": 0.3513, "step": 712 }, { "epoch": 1.5267665952890792, "grad_norm": 0.23714590267506044, "learning_rate": 2.73015873015873e-05, "loss": 0.37, "step": 713 }, { "epoch": 1.5289079229122056, "grad_norm": 0.23601477099160736, "learning_rate": 2.7261904761904762e-05, "loss": 0.3654, "step": 714 }, { "epoch": 1.531049250535332, "grad_norm": 0.22820632242815328, "learning_rate": 2.7222222222222223e-05, "loss": 0.3643, "step": 715 }, { "epoch": 1.5331905781584583, "grad_norm": 0.2367592790064326, "learning_rate": 2.718253968253968e-05, "loss": 0.3588, "step": 716 }, { "epoch": 1.5353319057815846, "grad_norm": 0.2283679400061131, "learning_rate": 2.714285714285714e-05, "loss": 0.3698, "step": 717 }, { "epoch": 1.537473233404711, "grad_norm": 0.23687279355617852, "learning_rate": 2.7103174603174602e-05, "loss": 0.3796, "step": 718 }, { "epoch": 1.5396145610278373, "grad_norm": 0.23367650341137466, "learning_rate": 2.7063492063492062e-05, "loss": 0.3817, "step": 719 }, { "epoch": 1.5417558886509637, "grad_norm": 0.2314615253573604, "learning_rate": 2.7023809523809523e-05, "loss": 0.3656, "step": 720 }, { "epoch": 1.54389721627409, "grad_norm": 0.24177942634670613, "learning_rate": 2.6984126984126984e-05, "loss": 0.3639, "step": 721 }, { "epoch": 1.5460385438972164, "grad_norm": 0.21932445481704804, "learning_rate": 2.6944444444444445e-05, "loss": 0.3702, "step": 722 }, { "epoch": 1.5481798715203428, "grad_norm": 0.24065414879821986, "learning_rate": 2.6904761904761905e-05, "loss": 0.3885, "step": 723 }, { "epoch": 1.550321199143469, "grad_norm": 0.20826666740635788, "learning_rate": 2.6865079365079366e-05, "loss": 0.372, "step": 724 }, { "epoch": 1.5524625267665952, "grad_norm": 0.22699790366079173, "learning_rate": 2.6825396825396827e-05, "loss": 0.3529, "step": 725 }, { "epoch": 1.5546038543897216, "grad_norm": 0.23143139316201608, "learning_rate": 2.6785714285714288e-05, "loss": 0.3873, "step": 726 }, { "epoch": 1.556745182012848, "grad_norm": 0.23566599378296404, "learning_rate": 2.6746031746031745e-05, "loss": 0.3613, "step": 727 }, { "epoch": 1.5588865096359743, "grad_norm": 0.2431611538451522, "learning_rate": 2.6706349206349206e-05, "loss": 0.3564, "step": 728 }, { "epoch": 1.5610278372591007, "grad_norm": 0.21557222183068245, "learning_rate": 2.6666666666666667e-05, "loss": 0.3519, "step": 729 }, { "epoch": 1.563169164882227, "grad_norm": 0.21761967858926073, "learning_rate": 2.6626984126984127e-05, "loss": 0.3676, "step": 730 }, { "epoch": 1.5653104925053534, "grad_norm": 0.2067924331453489, "learning_rate": 2.6587301587301588e-05, "loss": 0.357, "step": 731 }, { "epoch": 1.5674518201284795, "grad_norm": 0.21006075912880307, "learning_rate": 2.654761904761905e-05, "loss": 0.3545, "step": 732 }, { "epoch": 1.5695931477516059, "grad_norm": 0.23470822509906164, "learning_rate": 2.650793650793651e-05, "loss": 0.3698, "step": 733 }, { "epoch": 1.5717344753747322, "grad_norm": 0.19926495187855447, "learning_rate": 2.646825396825397e-05, "loss": 0.3622, "step": 734 }, { "epoch": 1.5738758029978586, "grad_norm": 0.24016060928887698, "learning_rate": 2.642857142857143e-05, "loss": 0.3582, "step": 735 }, { "epoch": 1.576017130620985, "grad_norm": 0.22487176055383068, "learning_rate": 2.6388888888888892e-05, "loss": 0.386, "step": 736 }, { "epoch": 1.5781584582441113, "grad_norm": 0.21079271486663667, "learning_rate": 2.6349206349206353e-05, "loss": 0.3623, "step": 737 }, { "epoch": 1.5802997858672376, "grad_norm": 0.2334507646133209, "learning_rate": 2.6309523809523813e-05, "loss": 0.3762, "step": 738 }, { "epoch": 1.582441113490364, "grad_norm": 0.19667313653775484, "learning_rate": 2.626984126984127e-05, "loss": 0.3474, "step": 739 }, { "epoch": 1.5845824411134903, "grad_norm": 0.21267970407005282, "learning_rate": 2.623015873015873e-05, "loss": 0.3582, "step": 740 }, { "epoch": 1.5867237687366167, "grad_norm": 0.19547706247312066, "learning_rate": 2.6190476190476192e-05, "loss": 0.364, "step": 741 }, { "epoch": 1.588865096359743, "grad_norm": 0.22715519405850873, "learning_rate": 2.6150793650793653e-05, "loss": 0.3589, "step": 742 }, { "epoch": 1.5910064239828694, "grad_norm": 0.2248474130605718, "learning_rate": 2.6111111111111114e-05, "loss": 0.3658, "step": 743 }, { "epoch": 1.5931477516059958, "grad_norm": 0.207264130106897, "learning_rate": 2.6071428571428574e-05, "loss": 0.3677, "step": 744 }, { "epoch": 1.595289079229122, "grad_norm": 0.20684564047898432, "learning_rate": 2.6031746031746035e-05, "loss": 0.3696, "step": 745 }, { "epoch": 1.5974304068522485, "grad_norm": 0.20367525386940707, "learning_rate": 2.5992063492063496e-05, "loss": 0.3481, "step": 746 }, { "epoch": 1.5995717344753748, "grad_norm": 0.20679426091923925, "learning_rate": 2.5952380952380957e-05, "loss": 0.3701, "step": 747 }, { "epoch": 1.6017130620985012, "grad_norm": 0.20915864152813907, "learning_rate": 2.5912698412698417e-05, "loss": 0.3633, "step": 748 }, { "epoch": 1.6038543897216275, "grad_norm": 0.2234637695145891, "learning_rate": 2.5873015873015878e-05, "loss": 0.3579, "step": 749 }, { "epoch": 1.6059957173447539, "grad_norm": 0.23358262127655052, "learning_rate": 2.5833333333333336e-05, "loss": 0.3823, "step": 750 }, { "epoch": 1.6081370449678802, "grad_norm": 0.22541931799902007, "learning_rate": 2.5793650793650796e-05, "loss": 0.3651, "step": 751 }, { "epoch": 1.6102783725910066, "grad_norm": 0.22458604036006147, "learning_rate": 2.5753968253968254e-05, "loss": 0.3672, "step": 752 }, { "epoch": 1.6124197002141327, "grad_norm": 0.19850956726240754, "learning_rate": 2.5714285714285714e-05, "loss": 0.3495, "step": 753 }, { "epoch": 1.614561027837259, "grad_norm": 0.22459012390393387, "learning_rate": 2.5674603174603172e-05, "loss": 0.3615, "step": 754 }, { "epoch": 1.6167023554603854, "grad_norm": 0.22306811697664986, "learning_rate": 2.5634920634920633e-05, "loss": 0.3629, "step": 755 }, { "epoch": 1.6188436830835118, "grad_norm": 0.22471296174548994, "learning_rate": 2.5595238095238093e-05, "loss": 0.3764, "step": 756 }, { "epoch": 1.6209850107066381, "grad_norm": 0.21158605873651065, "learning_rate": 2.5555555555555554e-05, "loss": 0.3653, "step": 757 }, { "epoch": 1.6231263383297645, "grad_norm": 0.21671080824519806, "learning_rate": 2.5515873015873015e-05, "loss": 0.371, "step": 758 }, { "epoch": 1.6252676659528906, "grad_norm": 0.2356438023908563, "learning_rate": 2.5476190476190476e-05, "loss": 0.364, "step": 759 }, { "epoch": 1.627408993576017, "grad_norm": 0.19447430345844002, "learning_rate": 2.5436507936507936e-05, "loss": 0.3368, "step": 760 }, { "epoch": 1.6295503211991433, "grad_norm": 0.22994645655583826, "learning_rate": 2.5396825396825397e-05, "loss": 0.3567, "step": 761 }, { "epoch": 1.6316916488222697, "grad_norm": 0.23001951683149954, "learning_rate": 2.5357142857142858e-05, "loss": 0.3684, "step": 762 }, { "epoch": 1.633832976445396, "grad_norm": 0.2119578315059528, "learning_rate": 2.531746031746032e-05, "loss": 0.3536, "step": 763 }, { "epoch": 1.6359743040685224, "grad_norm": 0.2615589609533618, "learning_rate": 2.527777777777778e-05, "loss": 0.389, "step": 764 }, { "epoch": 1.6381156316916488, "grad_norm": 0.22688862347727404, "learning_rate": 2.523809523809524e-05, "loss": 0.3659, "step": 765 }, { "epoch": 1.640256959314775, "grad_norm": 0.22643676902104382, "learning_rate": 2.5198412698412697e-05, "loss": 0.3698, "step": 766 }, { "epoch": 1.6423982869379015, "grad_norm": 0.21169667439175707, "learning_rate": 2.5158730158730158e-05, "loss": 0.3762, "step": 767 }, { "epoch": 1.6445396145610278, "grad_norm": 0.22932111037643213, "learning_rate": 2.511904761904762e-05, "loss": 0.367, "step": 768 }, { "epoch": 1.6466809421841542, "grad_norm": 0.21149524956469937, "learning_rate": 2.507936507936508e-05, "loss": 0.3764, "step": 769 }, { "epoch": 1.6488222698072805, "grad_norm": 0.22593475888288625, "learning_rate": 2.503968253968254e-05, "loss": 0.3651, "step": 770 }, { "epoch": 1.6509635974304069, "grad_norm": 0.22008417552751885, "learning_rate": 2.5e-05, "loss": 0.377, "step": 771 }, { "epoch": 1.6531049250535332, "grad_norm": 0.20441837816311934, "learning_rate": 2.4960317460317462e-05, "loss": 0.3716, "step": 772 }, { "epoch": 1.6552462526766596, "grad_norm": 0.21552942260647906, "learning_rate": 2.4920634920634923e-05, "loss": 0.3385, "step": 773 }, { "epoch": 1.657387580299786, "grad_norm": 0.2353507438436094, "learning_rate": 2.4880952380952383e-05, "loss": 0.3864, "step": 774 }, { "epoch": 1.6595289079229123, "grad_norm": 0.21413713760888206, "learning_rate": 2.4841269841269844e-05, "loss": 0.3647, "step": 775 }, { "epoch": 1.6616702355460387, "grad_norm": 0.2160670298547777, "learning_rate": 2.4801587301587305e-05, "loss": 0.3868, "step": 776 }, { "epoch": 1.663811563169165, "grad_norm": 0.21929322311304728, "learning_rate": 2.4761904761904762e-05, "loss": 0.3739, "step": 777 }, { "epoch": 1.6659528907922914, "grad_norm": 0.22788233645299552, "learning_rate": 2.4722222222222223e-05, "loss": 0.373, "step": 778 }, { "epoch": 1.6680942184154177, "grad_norm": 0.210338880481676, "learning_rate": 2.4682539682539684e-05, "loss": 0.3713, "step": 779 }, { "epoch": 1.6702355460385439, "grad_norm": 0.22350067161462142, "learning_rate": 2.4642857142857145e-05, "loss": 0.3664, "step": 780 }, { "epoch": 1.6723768736616702, "grad_norm": 0.22340962121009117, "learning_rate": 2.4603174603174602e-05, "loss": 0.3881, "step": 781 }, { "epoch": 1.6745182012847966, "grad_norm": 0.1990507179838242, "learning_rate": 2.4563492063492063e-05, "loss": 0.3746, "step": 782 }, { "epoch": 1.676659528907923, "grad_norm": 0.22318485815266106, "learning_rate": 2.4523809523809523e-05, "loss": 0.353, "step": 783 }, { "epoch": 1.6788008565310493, "grad_norm": 0.22041375294909685, "learning_rate": 2.4484126984126984e-05, "loss": 0.3724, "step": 784 }, { "epoch": 1.6809421841541756, "grad_norm": 0.21498770171477544, "learning_rate": 2.4444444444444445e-05, "loss": 0.3819, "step": 785 }, { "epoch": 1.683083511777302, "grad_norm": 0.23245272113804277, "learning_rate": 2.4404761904761906e-05, "loss": 0.3632, "step": 786 }, { "epoch": 1.685224839400428, "grad_norm": 0.2249931267054549, "learning_rate": 2.4365079365079366e-05, "loss": 0.3639, "step": 787 }, { "epoch": 1.6873661670235545, "grad_norm": 0.21799548712885633, "learning_rate": 2.4325396825396827e-05, "loss": 0.3581, "step": 788 }, { "epoch": 1.6895074946466808, "grad_norm": 0.22018263174045988, "learning_rate": 2.4285714285714288e-05, "loss": 0.3612, "step": 789 }, { "epoch": 1.6916488222698072, "grad_norm": 0.20033740087548269, "learning_rate": 2.424603174603175e-05, "loss": 0.3313, "step": 790 }, { "epoch": 1.6937901498929335, "grad_norm": 0.19788624340907016, "learning_rate": 2.4206349206349206e-05, "loss": 0.345, "step": 791 }, { "epoch": 1.6959314775160599, "grad_norm": 0.21154589560113807, "learning_rate": 2.4166666666666667e-05, "loss": 0.3448, "step": 792 }, { "epoch": 1.6980728051391862, "grad_norm": 0.21812600023361592, "learning_rate": 2.4126984126984128e-05, "loss": 0.3506, "step": 793 }, { "epoch": 1.7002141327623126, "grad_norm": 0.21274687742218662, "learning_rate": 2.408730158730159e-05, "loss": 0.3602, "step": 794 }, { "epoch": 1.702355460385439, "grad_norm": 0.20945076032569784, "learning_rate": 2.404761904761905e-05, "loss": 0.3609, "step": 795 }, { "epoch": 1.7044967880085653, "grad_norm": 0.22319883020181144, "learning_rate": 2.400793650793651e-05, "loss": 0.3707, "step": 796 }, { "epoch": 1.7066381156316917, "grad_norm": 0.23376387552204406, "learning_rate": 2.396825396825397e-05, "loss": 0.3777, "step": 797 }, { "epoch": 1.708779443254818, "grad_norm": 0.21375479088785895, "learning_rate": 2.392857142857143e-05, "loss": 0.3637, "step": 798 }, { "epoch": 1.7109207708779444, "grad_norm": 0.20013544930852412, "learning_rate": 2.3888888888888892e-05, "loss": 0.3662, "step": 799 }, { "epoch": 1.7130620985010707, "grad_norm": 0.23771022394213084, "learning_rate": 2.3849206349206353e-05, "loss": 0.3541, "step": 800 }, { "epoch": 1.715203426124197, "grad_norm": 0.2125013216241467, "learning_rate": 2.380952380952381e-05, "loss": 0.3586, "step": 801 }, { "epoch": 1.7173447537473234, "grad_norm": 0.24611181377977226, "learning_rate": 2.376984126984127e-05, "loss": 0.3765, "step": 802 }, { "epoch": 1.7194860813704498, "grad_norm": 0.2327060873577735, "learning_rate": 2.373015873015873e-05, "loss": 0.3533, "step": 803 }, { "epoch": 1.7216274089935761, "grad_norm": 0.22138909685445068, "learning_rate": 2.369047619047619e-05, "loss": 0.3583, "step": 804 }, { "epoch": 1.7237687366167025, "grad_norm": 0.208629956972268, "learning_rate": 2.365079365079365e-05, "loss": 0.3755, "step": 805 }, { "epoch": 1.7259100642398288, "grad_norm": 0.23858026530704865, "learning_rate": 2.361111111111111e-05, "loss": 0.3694, "step": 806 }, { "epoch": 1.728051391862955, "grad_norm": 0.21718501714953736, "learning_rate": 2.357142857142857e-05, "loss": 0.353, "step": 807 }, { "epoch": 1.7301927194860813, "grad_norm": 0.1938804380949089, "learning_rate": 2.3531746031746032e-05, "loss": 0.3474, "step": 808 }, { "epoch": 1.7323340471092077, "grad_norm": 0.21740566223114047, "learning_rate": 2.3492063492063493e-05, "loss": 0.3754, "step": 809 }, { "epoch": 1.734475374732334, "grad_norm": 0.2142359822570361, "learning_rate": 2.3452380952380954e-05, "loss": 0.358, "step": 810 }, { "epoch": 1.7366167023554604, "grad_norm": 0.2070519093899908, "learning_rate": 2.3412698412698414e-05, "loss": 0.3648, "step": 811 }, { "epoch": 1.7387580299785867, "grad_norm": 0.1994890470093942, "learning_rate": 2.3373015873015875e-05, "loss": 0.3721, "step": 812 }, { "epoch": 1.740899357601713, "grad_norm": 0.19930905538311752, "learning_rate": 2.3333333333333336e-05, "loss": 0.3634, "step": 813 }, { "epoch": 1.7430406852248392, "grad_norm": 0.22943112463202014, "learning_rate": 2.3293650793650797e-05, "loss": 0.3674, "step": 814 }, { "epoch": 1.7451820128479656, "grad_norm": 0.19595373289639564, "learning_rate": 2.3253968253968257e-05, "loss": 0.3826, "step": 815 }, { "epoch": 1.747323340471092, "grad_norm": 0.23295233026294598, "learning_rate": 2.3214285714285715e-05, "loss": 0.3656, "step": 816 }, { "epoch": 1.7494646680942183, "grad_norm": 0.21540081438261183, "learning_rate": 2.3174603174603175e-05, "loss": 0.3829, "step": 817 }, { "epoch": 1.7516059957173447, "grad_norm": 0.1864879412189447, "learning_rate": 2.3134920634920636e-05, "loss": 0.3554, "step": 818 }, { "epoch": 1.753747323340471, "grad_norm": 0.2387089230839924, "learning_rate": 2.3095238095238097e-05, "loss": 0.385, "step": 819 }, { "epoch": 1.7558886509635974, "grad_norm": 0.2232438395168808, "learning_rate": 2.3055555555555558e-05, "loss": 0.3504, "step": 820 }, { "epoch": 1.7580299785867237, "grad_norm": 0.20415844428245095, "learning_rate": 2.3015873015873015e-05, "loss": 0.3789, "step": 821 }, { "epoch": 1.76017130620985, "grad_norm": 0.19921172666296888, "learning_rate": 2.2976190476190476e-05, "loss": 0.3701, "step": 822 }, { "epoch": 1.7623126338329764, "grad_norm": 0.22470908559459463, "learning_rate": 2.2936507936507937e-05, "loss": 0.3619, "step": 823 }, { "epoch": 1.7644539614561028, "grad_norm": 0.1959331611395145, "learning_rate": 2.2896825396825397e-05, "loss": 0.343, "step": 824 }, { "epoch": 1.7665952890792291, "grad_norm": 0.21307394355566658, "learning_rate": 2.2857142857142858e-05, "loss": 0.3546, "step": 825 }, { "epoch": 1.7687366167023555, "grad_norm": 0.20578224521899074, "learning_rate": 2.281746031746032e-05, "loss": 0.3795, "step": 826 }, { "epoch": 1.7708779443254818, "grad_norm": 0.2189247535460949, "learning_rate": 2.277777777777778e-05, "loss": 0.3607, "step": 827 }, { "epoch": 1.7730192719486082, "grad_norm": 0.21596310355670684, "learning_rate": 2.273809523809524e-05, "loss": 0.3584, "step": 828 }, { "epoch": 1.7751605995717346, "grad_norm": 0.2008393875425573, "learning_rate": 2.2698412698412698e-05, "loss": 0.3647, "step": 829 }, { "epoch": 1.777301927194861, "grad_norm": 0.21184243202841968, "learning_rate": 2.265873015873016e-05, "loss": 0.3455, "step": 830 }, { "epoch": 1.7794432548179873, "grad_norm": 0.22865192619400915, "learning_rate": 2.261904761904762e-05, "loss": 0.3901, "step": 831 }, { "epoch": 1.7815845824411136, "grad_norm": 0.22618545858724132, "learning_rate": 2.257936507936508e-05, "loss": 0.3832, "step": 832 }, { "epoch": 1.78372591006424, "grad_norm": 0.22738066487492137, "learning_rate": 2.253968253968254e-05, "loss": 0.3399, "step": 833 }, { "epoch": 1.7858672376873663, "grad_norm": 0.23621094995588868, "learning_rate": 2.25e-05, "loss": 0.3737, "step": 834 }, { "epoch": 1.7880085653104925, "grad_norm": 0.2157005750309211, "learning_rate": 2.2460317460317462e-05, "loss": 0.3556, "step": 835 }, { "epoch": 1.7901498929336188, "grad_norm": 0.20967061124515987, "learning_rate": 2.2420634920634923e-05, "loss": 0.3711, "step": 836 }, { "epoch": 1.7922912205567452, "grad_norm": 0.21011640159026213, "learning_rate": 2.2380952380952384e-05, "loss": 0.3696, "step": 837 }, { "epoch": 1.7944325481798715, "grad_norm": 0.2301543332161994, "learning_rate": 2.2341269841269844e-05, "loss": 0.3619, "step": 838 }, { "epoch": 1.7965738758029979, "grad_norm": 0.19783929504773992, "learning_rate": 2.2301587301587305e-05, "loss": 0.3711, "step": 839 }, { "epoch": 1.7987152034261242, "grad_norm": 0.2027645971091363, "learning_rate": 2.2261904761904763e-05, "loss": 0.3575, "step": 840 }, { "epoch": 1.8008565310492506, "grad_norm": 0.21010286682467308, "learning_rate": 2.2222222222222223e-05, "loss": 0.3684, "step": 841 }, { "epoch": 1.8029978586723767, "grad_norm": 0.1962020508553967, "learning_rate": 2.2182539682539684e-05, "loss": 0.3651, "step": 842 }, { "epoch": 1.805139186295503, "grad_norm": 0.20781819889894468, "learning_rate": 2.214285714285714e-05, "loss": 0.365, "step": 843 }, { "epoch": 1.8072805139186294, "grad_norm": 0.19696845772215546, "learning_rate": 2.2103174603174602e-05, "loss": 0.3629, "step": 844 }, { "epoch": 1.8094218415417558, "grad_norm": 0.21583117590285078, "learning_rate": 2.2063492063492063e-05, "loss": 0.3617, "step": 845 }, { "epoch": 1.8115631691648821, "grad_norm": 0.2173603569007904, "learning_rate": 2.2023809523809524e-05, "loss": 0.3734, "step": 846 }, { "epoch": 1.8137044967880085, "grad_norm": 0.2022970217705777, "learning_rate": 2.1984126984126984e-05, "loss": 0.3659, "step": 847 }, { "epoch": 1.8158458244111348, "grad_norm": 0.24139485756064838, "learning_rate": 2.1944444444444445e-05, "loss": 0.3644, "step": 848 }, { "epoch": 1.8179871520342612, "grad_norm": 0.216967803177166, "learning_rate": 2.1904761904761906e-05, "loss": 0.3625, "step": 849 }, { "epoch": 1.8201284796573876, "grad_norm": 0.21018593632480406, "learning_rate": 2.1865079365079367e-05, "loss": 0.3674, "step": 850 }, { "epoch": 1.822269807280514, "grad_norm": 0.21491326859755341, "learning_rate": 2.1825396825396827e-05, "loss": 0.376, "step": 851 }, { "epoch": 1.8244111349036403, "grad_norm": 0.2016970958055073, "learning_rate": 2.1785714285714288e-05, "loss": 0.3682, "step": 852 }, { "epoch": 1.8265524625267666, "grad_norm": 0.23649936879215913, "learning_rate": 2.174603174603175e-05, "loss": 0.3671, "step": 853 }, { "epoch": 1.828693790149893, "grad_norm": 0.20337632262549168, "learning_rate": 2.170634920634921e-05, "loss": 0.3461, "step": 854 }, { "epoch": 1.8308351177730193, "grad_norm": 0.21300952050085178, "learning_rate": 2.1666666666666667e-05, "loss": 0.3542, "step": 855 }, { "epoch": 1.8329764453961457, "grad_norm": 0.21370860755844873, "learning_rate": 2.1626984126984128e-05, "loss": 0.3485, "step": 856 }, { "epoch": 1.835117773019272, "grad_norm": 0.2238345205767126, "learning_rate": 2.158730158730159e-05, "loss": 0.3538, "step": 857 }, { "epoch": 1.8372591006423984, "grad_norm": 0.20314755234997184, "learning_rate": 2.154761904761905e-05, "loss": 0.3618, "step": 858 }, { "epoch": 1.8394004282655247, "grad_norm": 0.19823141224145824, "learning_rate": 2.150793650793651e-05, "loss": 0.3496, "step": 859 }, { "epoch": 1.841541755888651, "grad_norm": 0.19176641722853272, "learning_rate": 2.1468253968253967e-05, "loss": 0.3712, "step": 860 }, { "epoch": 1.8436830835117775, "grad_norm": 0.20478508554466868, "learning_rate": 2.1428571428571428e-05, "loss": 0.3548, "step": 861 }, { "epoch": 1.8458244111349036, "grad_norm": 0.19864762266874297, "learning_rate": 2.138888888888889e-05, "loss": 0.3616, "step": 862 }, { "epoch": 1.84796573875803, "grad_norm": 0.20776884101667364, "learning_rate": 2.134920634920635e-05, "loss": 0.3674, "step": 863 }, { "epoch": 1.8501070663811563, "grad_norm": 0.20958140144946166, "learning_rate": 2.130952380952381e-05, "loss": 0.3681, "step": 864 }, { "epoch": 1.8522483940042827, "grad_norm": 0.1866017449921812, "learning_rate": 2.126984126984127e-05, "loss": 0.3509, "step": 865 }, { "epoch": 1.854389721627409, "grad_norm": 0.1893458266122647, "learning_rate": 2.1230158730158732e-05, "loss": 0.3508, "step": 866 }, { "epoch": 1.8565310492505354, "grad_norm": 0.22008473964117356, "learning_rate": 2.1190476190476193e-05, "loss": 0.3678, "step": 867 }, { "epoch": 1.8586723768736617, "grad_norm": 0.1936262836818084, "learning_rate": 2.115079365079365e-05, "loss": 0.3507, "step": 868 }, { "epoch": 1.8608137044967878, "grad_norm": 0.19939279008943295, "learning_rate": 2.111111111111111e-05, "loss": 0.3606, "step": 869 }, { "epoch": 1.8629550321199142, "grad_norm": 0.20666085743177595, "learning_rate": 2.107142857142857e-05, "loss": 0.3564, "step": 870 }, { "epoch": 1.8650963597430406, "grad_norm": 0.22692963345797962, "learning_rate": 2.1031746031746032e-05, "loss": 0.3512, "step": 871 }, { "epoch": 1.867237687366167, "grad_norm": 0.21239492391629777, "learning_rate": 2.0992063492063493e-05, "loss": 0.3728, "step": 872 }, { "epoch": 1.8693790149892933, "grad_norm": 0.1984911807328775, "learning_rate": 2.0952380952380954e-05, "loss": 0.3651, "step": 873 }, { "epoch": 1.8715203426124196, "grad_norm": 0.2035710275475753, "learning_rate": 2.0912698412698415e-05, "loss": 0.3566, "step": 874 }, { "epoch": 1.873661670235546, "grad_norm": 0.21183590394095844, "learning_rate": 2.0873015873015875e-05, "loss": 0.3544, "step": 875 }, { "epoch": 1.8758029978586723, "grad_norm": 0.20710933660481542, "learning_rate": 2.0833333333333336e-05, "loss": 0.3526, "step": 876 }, { "epoch": 1.8779443254817987, "grad_norm": 0.20824572300787447, "learning_rate": 2.0793650793650797e-05, "loss": 0.3429, "step": 877 }, { "epoch": 1.880085653104925, "grad_norm": 0.21277139237572662, "learning_rate": 2.0753968253968258e-05, "loss": 0.3647, "step": 878 }, { "epoch": 1.8822269807280514, "grad_norm": 0.20675962730393843, "learning_rate": 2.0714285714285718e-05, "loss": 0.3656, "step": 879 }, { "epoch": 1.8843683083511777, "grad_norm": 0.2578061912750937, "learning_rate": 2.0674603174603176e-05, "loss": 0.3533, "step": 880 }, { "epoch": 1.886509635974304, "grad_norm": 0.20710356677023856, "learning_rate": 2.0634920634920636e-05, "loss": 0.3609, "step": 881 }, { "epoch": 1.8886509635974305, "grad_norm": 0.20736571003231136, "learning_rate": 2.0595238095238094e-05, "loss": 0.3518, "step": 882 }, { "epoch": 1.8907922912205568, "grad_norm": 0.219731020163135, "learning_rate": 2.0555555555555555e-05, "loss": 0.3507, "step": 883 }, { "epoch": 1.8929336188436832, "grad_norm": 0.21068978476088052, "learning_rate": 2.0515873015873015e-05, "loss": 0.3499, "step": 884 }, { "epoch": 1.8950749464668095, "grad_norm": 0.2585013587777131, "learning_rate": 2.0476190476190476e-05, "loss": 0.3627, "step": 885 }, { "epoch": 1.8972162740899359, "grad_norm": 0.19255613092161056, "learning_rate": 2.0436507936507937e-05, "loss": 0.3547, "step": 886 }, { "epoch": 1.8993576017130622, "grad_norm": 0.21046046883766856, "learning_rate": 2.0396825396825398e-05, "loss": 0.3688, "step": 887 }, { "epoch": 1.9014989293361886, "grad_norm": 0.2031259267946657, "learning_rate": 2.0357142857142858e-05, "loss": 0.366, "step": 888 }, { "epoch": 1.903640256959315, "grad_norm": 0.22385718639419275, "learning_rate": 2.031746031746032e-05, "loss": 0.366, "step": 889 }, { "epoch": 1.905781584582441, "grad_norm": 0.20168399193948486, "learning_rate": 2.027777777777778e-05, "loss": 0.3562, "step": 890 }, { "epoch": 1.9079229122055674, "grad_norm": 0.19396525152354396, "learning_rate": 2.023809523809524e-05, "loss": 0.356, "step": 891 }, { "epoch": 1.9100642398286938, "grad_norm": 0.20798336659145256, "learning_rate": 2.01984126984127e-05, "loss": 0.3438, "step": 892 }, { "epoch": 1.9122055674518201, "grad_norm": 0.2135461327301388, "learning_rate": 2.015873015873016e-05, "loss": 0.3556, "step": 893 }, { "epoch": 1.9143468950749465, "grad_norm": 0.20025569788736822, "learning_rate": 2.011904761904762e-05, "loss": 0.3522, "step": 894 }, { "epoch": 1.9164882226980728, "grad_norm": 0.21073141685921978, "learning_rate": 2.007936507936508e-05, "loss": 0.3669, "step": 895 }, { "epoch": 1.9186295503211992, "grad_norm": 0.20550729295532494, "learning_rate": 2.003968253968254e-05, "loss": 0.3563, "step": 896 }, { "epoch": 1.9207708779443253, "grad_norm": 0.2219289886669546, "learning_rate": 2e-05, "loss": 0.3632, "step": 897 }, { "epoch": 1.9229122055674517, "grad_norm": 0.20426747560999273, "learning_rate": 1.9960317460317462e-05, "loss": 0.3731, "step": 898 }, { "epoch": 1.925053533190578, "grad_norm": 0.21309608447398956, "learning_rate": 1.992063492063492e-05, "loss": 0.3822, "step": 899 }, { "epoch": 1.9271948608137044, "grad_norm": 0.2047630612588562, "learning_rate": 1.988095238095238e-05, "loss": 0.3826, "step": 900 }, { "epoch": 1.9293361884368307, "grad_norm": 0.20339744126965478, "learning_rate": 1.984126984126984e-05, "loss": 0.3635, "step": 901 }, { "epoch": 1.931477516059957, "grad_norm": 0.19470480788266056, "learning_rate": 1.9801587301587302e-05, "loss": 0.3444, "step": 902 }, { "epoch": 1.9336188436830835, "grad_norm": 0.22460440732783768, "learning_rate": 1.9761904761904763e-05, "loss": 0.379, "step": 903 }, { "epoch": 1.9357601713062098, "grad_norm": 0.22213495508308273, "learning_rate": 1.9722222222222224e-05, "loss": 0.3676, "step": 904 }, { "epoch": 1.9379014989293362, "grad_norm": 0.20343330118385294, "learning_rate": 1.9682539682539684e-05, "loss": 0.3546, "step": 905 }, { "epoch": 1.9400428265524625, "grad_norm": 0.19101512005815446, "learning_rate": 1.9642857142857145e-05, "loss": 0.3545, "step": 906 }, { "epoch": 1.9421841541755889, "grad_norm": 0.27419563179542383, "learning_rate": 1.9603174603174602e-05, "loss": 0.339, "step": 907 }, { "epoch": 1.9443254817987152, "grad_norm": 0.20460100369509523, "learning_rate": 1.9563492063492063e-05, "loss": 0.3713, "step": 908 }, { "epoch": 1.9464668094218416, "grad_norm": 0.21726477076370293, "learning_rate": 1.9523809523809524e-05, "loss": 0.3674, "step": 909 }, { "epoch": 1.948608137044968, "grad_norm": 0.2366081598522377, "learning_rate": 1.9484126984126985e-05, "loss": 0.3716, "step": 910 }, { "epoch": 1.9507494646680943, "grad_norm": 0.2309915328014547, "learning_rate": 1.9444444444444445e-05, "loss": 0.3777, "step": 911 }, { "epoch": 1.9528907922912206, "grad_norm": 0.21457199704732352, "learning_rate": 1.9404761904761906e-05, "loss": 0.3519, "step": 912 }, { "epoch": 1.955032119914347, "grad_norm": 0.21421731139535555, "learning_rate": 1.9365079365079367e-05, "loss": 0.3595, "step": 913 }, { "epoch": 1.9571734475374734, "grad_norm": 0.2227014663977621, "learning_rate": 1.9325396825396828e-05, "loss": 0.3618, "step": 914 }, { "epoch": 1.9593147751605997, "grad_norm": 0.19983383858352952, "learning_rate": 1.928571428571429e-05, "loss": 0.3566, "step": 915 }, { "epoch": 1.961456102783726, "grad_norm": 0.19358450153214074, "learning_rate": 1.924603174603175e-05, "loss": 0.3456, "step": 916 }, { "epoch": 1.9635974304068522, "grad_norm": 0.19694776679615417, "learning_rate": 1.920634920634921e-05, "loss": 0.3601, "step": 917 }, { "epoch": 1.9657387580299786, "grad_norm": 0.20280005475553886, "learning_rate": 1.9166666666666667e-05, "loss": 0.3537, "step": 918 }, { "epoch": 1.967880085653105, "grad_norm": 0.19909912132770102, "learning_rate": 1.9126984126984128e-05, "loss": 0.3579, "step": 919 }, { "epoch": 1.9700214132762313, "grad_norm": 0.22713082416817215, "learning_rate": 1.9087301587301585e-05, "loss": 0.3768, "step": 920 }, { "epoch": 1.9721627408993576, "grad_norm": 0.211019913621537, "learning_rate": 1.9047619047619046e-05, "loss": 0.3541, "step": 921 }, { "epoch": 1.974304068522484, "grad_norm": 0.20335978017528178, "learning_rate": 1.9007936507936507e-05, "loss": 0.3715, "step": 922 }, { "epoch": 1.9764453961456103, "grad_norm": 0.2138102480162949, "learning_rate": 1.8968253968253968e-05, "loss": 0.3591, "step": 923 }, { "epoch": 1.9785867237687365, "grad_norm": 0.24453013877035817, "learning_rate": 1.892857142857143e-05, "loss": 0.3853, "step": 924 }, { "epoch": 1.9807280513918628, "grad_norm": 0.20138123670876026, "learning_rate": 1.888888888888889e-05, "loss": 0.3693, "step": 925 }, { "epoch": 1.9828693790149892, "grad_norm": 0.2130228707007481, "learning_rate": 1.884920634920635e-05, "loss": 0.3756, "step": 926 }, { "epoch": 1.9850107066381155, "grad_norm": 0.21159991261313507, "learning_rate": 1.880952380952381e-05, "loss": 0.3559, "step": 927 }, { "epoch": 1.9871520342612419, "grad_norm": 0.1926680844736931, "learning_rate": 1.876984126984127e-05, "loss": 0.3564, "step": 928 }, { "epoch": 1.9892933618843682, "grad_norm": 0.19459069355708655, "learning_rate": 1.8730158730158732e-05, "loss": 0.3496, "step": 929 }, { "epoch": 1.9914346895074946, "grad_norm": 0.2133220689283602, "learning_rate": 1.8690476190476193e-05, "loss": 0.3651, "step": 930 }, { "epoch": 1.993576017130621, "grad_norm": 0.2027554437180913, "learning_rate": 1.8650793650793654e-05, "loss": 0.3695, "step": 931 }, { "epoch": 1.9957173447537473, "grad_norm": 0.19702179225284736, "learning_rate": 1.861111111111111e-05, "loss": 0.3659, "step": 932 }, { "epoch": 1.9978586723768736, "grad_norm": 0.21240087364498741, "learning_rate": 1.8571428571428572e-05, "loss": 0.362, "step": 933 }, { "epoch": 2.0, "grad_norm": 0.21641052576639658, "learning_rate": 1.8531746031746032e-05, "loss": 0.3479, "step": 934 }, { "epoch": 2.0021413276231264, "grad_norm": 0.3127795260229852, "learning_rate": 1.8492063492063493e-05, "loss": 0.2938, "step": 935 }, { "epoch": 2.0042826552462527, "grad_norm": 0.20537155001189475, "learning_rate": 1.8452380952380954e-05, "loss": 0.2725, "step": 936 }, { "epoch": 2.006423982869379, "grad_norm": 0.4035860023057039, "learning_rate": 1.8412698412698415e-05, "loss": 0.2877, "step": 937 }, { "epoch": 2.0085653104925054, "grad_norm": 0.2729391560493891, "learning_rate": 1.8373015873015875e-05, "loss": 0.2821, "step": 938 }, { "epoch": 2.0107066381156318, "grad_norm": 0.23930664003910482, "learning_rate": 1.8333333333333333e-05, "loss": 0.2732, "step": 939 }, { "epoch": 2.012847965738758, "grad_norm": 0.2534172257988802, "learning_rate": 1.8293650793650794e-05, "loss": 0.2823, "step": 940 }, { "epoch": 2.0149892933618845, "grad_norm": 0.2842003636688084, "learning_rate": 1.8253968253968254e-05, "loss": 0.2699, "step": 941 }, { "epoch": 2.017130620985011, "grad_norm": 0.2546284637134941, "learning_rate": 1.8214285714285715e-05, "loss": 0.2898, "step": 942 }, { "epoch": 2.019271948608137, "grad_norm": 0.25923976328845644, "learning_rate": 1.8174603174603176e-05, "loss": 0.2835, "step": 943 }, { "epoch": 2.0214132762312635, "grad_norm": 0.2694090119639284, "learning_rate": 1.8134920634920637e-05, "loss": 0.2789, "step": 944 }, { "epoch": 2.02355460385439, "grad_norm": 0.2560183056621401, "learning_rate": 1.8095238095238094e-05, "loss": 0.288, "step": 945 }, { "epoch": 2.0256959314775163, "grad_norm": 0.20946395889175382, "learning_rate": 1.8055555555555555e-05, "loss": 0.2726, "step": 946 }, { "epoch": 2.0278372591006426, "grad_norm": 0.23286538060698495, "learning_rate": 1.8015873015873015e-05, "loss": 0.2649, "step": 947 }, { "epoch": 2.0299785867237685, "grad_norm": 0.2812062902226991, "learning_rate": 1.7976190476190476e-05, "loss": 0.2761, "step": 948 }, { "epoch": 2.032119914346895, "grad_norm": 0.21066691235196103, "learning_rate": 1.7936507936507937e-05, "loss": 0.2767, "step": 949 }, { "epoch": 2.0342612419700212, "grad_norm": 0.22546963160204261, "learning_rate": 1.7896825396825398e-05, "loss": 0.2748, "step": 950 }, { "epoch": 2.0364025695931476, "grad_norm": 0.2695316483089908, "learning_rate": 1.785714285714286e-05, "loss": 0.2931, "step": 951 }, { "epoch": 2.038543897216274, "grad_norm": 0.224650667985638, "learning_rate": 1.781746031746032e-05, "loss": 0.286, "step": 952 }, { "epoch": 2.0406852248394003, "grad_norm": 0.23425810863567934, "learning_rate": 1.777777777777778e-05, "loss": 0.2768, "step": 953 }, { "epoch": 2.0428265524625266, "grad_norm": 0.21885265388287464, "learning_rate": 1.773809523809524e-05, "loss": 0.2664, "step": 954 }, { "epoch": 2.044967880085653, "grad_norm": 0.2220128915627926, "learning_rate": 1.76984126984127e-05, "loss": 0.2792, "step": 955 }, { "epoch": 2.0471092077087794, "grad_norm": 0.21795261361627535, "learning_rate": 1.7658730158730162e-05, "loss": 0.2963, "step": 956 }, { "epoch": 2.0492505353319057, "grad_norm": 0.22853139984187426, "learning_rate": 1.761904761904762e-05, "loss": 0.2774, "step": 957 }, { "epoch": 2.051391862955032, "grad_norm": 0.21005634342559973, "learning_rate": 1.757936507936508e-05, "loss": 0.278, "step": 958 }, { "epoch": 2.0535331905781584, "grad_norm": 0.1972736100196403, "learning_rate": 1.7539682539682538e-05, "loss": 0.276, "step": 959 }, { "epoch": 2.0556745182012848, "grad_norm": 0.2013981720969393, "learning_rate": 1.75e-05, "loss": 0.2725, "step": 960 }, { "epoch": 2.057815845824411, "grad_norm": 0.21114011142047054, "learning_rate": 1.746031746031746e-05, "loss": 0.2692, "step": 961 }, { "epoch": 2.0599571734475375, "grad_norm": 0.19810710536290826, "learning_rate": 1.742063492063492e-05, "loss": 0.2689, "step": 962 }, { "epoch": 2.062098501070664, "grad_norm": 0.20962838769163566, "learning_rate": 1.738095238095238e-05, "loss": 0.2755, "step": 963 }, { "epoch": 2.06423982869379, "grad_norm": 0.2007780468677198, "learning_rate": 1.734126984126984e-05, "loss": 0.2742, "step": 964 }, { "epoch": 2.0663811563169165, "grad_norm": 0.20730329087379573, "learning_rate": 1.7301587301587302e-05, "loss": 0.2796, "step": 965 }, { "epoch": 2.068522483940043, "grad_norm": 0.20422916553473747, "learning_rate": 1.7261904761904763e-05, "loss": 0.2717, "step": 966 }, { "epoch": 2.0706638115631693, "grad_norm": 0.21624827975621064, "learning_rate": 1.7222222222222224e-05, "loss": 0.2736, "step": 967 }, { "epoch": 2.0728051391862956, "grad_norm": 0.211530596933148, "learning_rate": 1.7182539682539684e-05, "loss": 0.2856, "step": 968 }, { "epoch": 2.074946466809422, "grad_norm": 0.21392283135095658, "learning_rate": 1.7142857142857145e-05, "loss": 0.2784, "step": 969 }, { "epoch": 2.0770877944325483, "grad_norm": 0.22575272429775897, "learning_rate": 1.7103174603174606e-05, "loss": 0.2683, "step": 970 }, { "epoch": 2.0792291220556747, "grad_norm": 0.20807075241886105, "learning_rate": 1.7063492063492063e-05, "loss": 0.2811, "step": 971 }, { "epoch": 2.081370449678801, "grad_norm": 0.19696007356880943, "learning_rate": 1.7023809523809524e-05, "loss": 0.2616, "step": 972 }, { "epoch": 2.0835117773019274, "grad_norm": 0.2067907488812531, "learning_rate": 1.6984126984126985e-05, "loss": 0.2867, "step": 973 }, { "epoch": 2.0856531049250537, "grad_norm": 0.20785661967310565, "learning_rate": 1.6944444444444446e-05, "loss": 0.2804, "step": 974 }, { "epoch": 2.08779443254818, "grad_norm": 0.21316354407545085, "learning_rate": 1.6904761904761906e-05, "loss": 0.2825, "step": 975 }, { "epoch": 2.089935760171306, "grad_norm": 0.20417574340319766, "learning_rate": 1.6865079365079367e-05, "loss": 0.2698, "step": 976 }, { "epoch": 2.0920770877944324, "grad_norm": 0.20059086132923634, "learning_rate": 1.6825396825396828e-05, "loss": 0.2838, "step": 977 }, { "epoch": 2.0942184154175587, "grad_norm": 0.20806774530665742, "learning_rate": 1.6785714285714285e-05, "loss": 0.2954, "step": 978 }, { "epoch": 2.096359743040685, "grad_norm": 0.21795686396623343, "learning_rate": 1.6746031746031746e-05, "loss": 0.2758, "step": 979 }, { "epoch": 2.0985010706638114, "grad_norm": 0.20525399917369724, "learning_rate": 1.6706349206349207e-05, "loss": 0.2852, "step": 980 }, { "epoch": 2.1006423982869378, "grad_norm": 0.20153493188561758, "learning_rate": 1.6666666666666667e-05, "loss": 0.2671, "step": 981 }, { "epoch": 2.102783725910064, "grad_norm": 0.21682029099052538, "learning_rate": 1.6626984126984128e-05, "loss": 0.2794, "step": 982 }, { "epoch": 2.1049250535331905, "grad_norm": 0.20982679296246803, "learning_rate": 1.658730158730159e-05, "loss": 0.2755, "step": 983 }, { "epoch": 2.107066381156317, "grad_norm": 0.20945525288086686, "learning_rate": 1.6547619047619046e-05, "loss": 0.2675, "step": 984 }, { "epoch": 2.109207708779443, "grad_norm": 0.21376143878454182, "learning_rate": 1.6507936507936507e-05, "loss": 0.2832, "step": 985 }, { "epoch": 2.1113490364025695, "grad_norm": 0.19436511910516777, "learning_rate": 1.6468253968253968e-05, "loss": 0.2737, "step": 986 }, { "epoch": 2.113490364025696, "grad_norm": 0.20930636270391903, "learning_rate": 1.642857142857143e-05, "loss": 0.2758, "step": 987 }, { "epoch": 2.1156316916488223, "grad_norm": 0.23244369003497906, "learning_rate": 1.638888888888889e-05, "loss": 0.296, "step": 988 }, { "epoch": 2.1177730192719486, "grad_norm": 0.19066256457699418, "learning_rate": 1.634920634920635e-05, "loss": 0.2738, "step": 989 }, { "epoch": 2.119914346895075, "grad_norm": 0.19911072991641976, "learning_rate": 1.630952380952381e-05, "loss": 0.2715, "step": 990 }, { "epoch": 2.1220556745182013, "grad_norm": 0.21201959394684514, "learning_rate": 1.626984126984127e-05, "loss": 0.2922, "step": 991 }, { "epoch": 2.1241970021413277, "grad_norm": 0.1967089705488496, "learning_rate": 1.6230158730158732e-05, "loss": 0.2715, "step": 992 }, { "epoch": 2.126338329764454, "grad_norm": 0.19369231393643502, "learning_rate": 1.6190476190476193e-05, "loss": 0.2784, "step": 993 }, { "epoch": 2.1284796573875804, "grad_norm": 0.2040785519443548, "learning_rate": 1.6150793650793654e-05, "loss": 0.2767, "step": 994 }, { "epoch": 2.1306209850107067, "grad_norm": 0.1898501436472166, "learning_rate": 1.6111111111111115e-05, "loss": 0.2678, "step": 995 }, { "epoch": 2.132762312633833, "grad_norm": 0.20111950499112383, "learning_rate": 1.6071428571428572e-05, "loss": 0.2823, "step": 996 }, { "epoch": 2.1349036402569594, "grad_norm": 0.1981498781288381, "learning_rate": 1.6031746031746033e-05, "loss": 0.2747, "step": 997 }, { "epoch": 2.137044967880086, "grad_norm": 0.19271209341880322, "learning_rate": 1.599206349206349e-05, "loss": 0.2752, "step": 998 }, { "epoch": 2.139186295503212, "grad_norm": 0.2221235505452683, "learning_rate": 1.595238095238095e-05, "loss": 0.2716, "step": 999 }, { "epoch": 2.1413276231263385, "grad_norm": 0.19713198559950088, "learning_rate": 1.591269841269841e-05, "loss": 0.2812, "step": 1000 }, { "epoch": 2.143468950749465, "grad_norm": 0.19897355551674525, "learning_rate": 1.5873015873015872e-05, "loss": 0.2846, "step": 1001 }, { "epoch": 2.145610278372591, "grad_norm": 0.19298432452205258, "learning_rate": 1.5833333333333333e-05, "loss": 0.2682, "step": 1002 }, { "epoch": 2.147751605995717, "grad_norm": 0.2019146536667124, "learning_rate": 1.5793650793650794e-05, "loss": 0.2735, "step": 1003 }, { "epoch": 2.1498929336188435, "grad_norm": 0.20154869107354417, "learning_rate": 1.5753968253968255e-05, "loss": 0.2632, "step": 1004 }, { "epoch": 2.15203426124197, "grad_norm": 0.19363808622473339, "learning_rate": 1.5714285714285715e-05, "loss": 0.2874, "step": 1005 }, { "epoch": 2.154175588865096, "grad_norm": 0.2046720100698483, "learning_rate": 1.5674603174603176e-05, "loss": 0.2754, "step": 1006 }, { "epoch": 2.1563169164882225, "grad_norm": 0.1930006937085633, "learning_rate": 1.5634920634920637e-05, "loss": 0.2726, "step": 1007 }, { "epoch": 2.158458244111349, "grad_norm": 0.20734097354646286, "learning_rate": 1.5595238095238098e-05, "loss": 0.2879, "step": 1008 }, { "epoch": 2.1605995717344753, "grad_norm": 0.19538298745906257, "learning_rate": 1.5555555555555555e-05, "loss": 0.2809, "step": 1009 }, { "epoch": 2.1627408993576016, "grad_norm": 0.2252535619453128, "learning_rate": 1.5515873015873016e-05, "loss": 0.286, "step": 1010 }, { "epoch": 2.164882226980728, "grad_norm": 0.19426151913359876, "learning_rate": 1.5476190476190476e-05, "loss": 0.2806, "step": 1011 }, { "epoch": 2.1670235546038543, "grad_norm": 0.20018884891444158, "learning_rate": 1.5436507936507937e-05, "loss": 0.2922, "step": 1012 }, { "epoch": 2.1691648822269807, "grad_norm": 0.20745084715389592, "learning_rate": 1.5396825396825398e-05, "loss": 0.272, "step": 1013 }, { "epoch": 2.171306209850107, "grad_norm": 0.2005966941166649, "learning_rate": 1.535714285714286e-05, "loss": 0.2748, "step": 1014 }, { "epoch": 2.1734475374732334, "grad_norm": 0.1959554222950539, "learning_rate": 1.531746031746032e-05, "loss": 0.2725, "step": 1015 }, { "epoch": 2.1755888650963597, "grad_norm": 0.2024673027309239, "learning_rate": 1.527777777777778e-05, "loss": 0.2925, "step": 1016 }, { "epoch": 2.177730192719486, "grad_norm": 0.18968936546486206, "learning_rate": 1.5238095238095241e-05, "loss": 0.2734, "step": 1017 }, { "epoch": 2.1798715203426124, "grad_norm": 0.21451121125699396, "learning_rate": 1.5198412698412698e-05, "loss": 0.2729, "step": 1018 }, { "epoch": 2.182012847965739, "grad_norm": 0.20234427279725273, "learning_rate": 1.5158730158730159e-05, "loss": 0.2846, "step": 1019 }, { "epoch": 2.184154175588865, "grad_norm": 0.19307169286798295, "learning_rate": 1.5119047619047618e-05, "loss": 0.2704, "step": 1020 }, { "epoch": 2.1862955032119915, "grad_norm": 0.1831847011586216, "learning_rate": 1.5079365079365079e-05, "loss": 0.2682, "step": 1021 }, { "epoch": 2.188436830835118, "grad_norm": 0.19718660927111226, "learning_rate": 1.503968253968254e-05, "loss": 0.2665, "step": 1022 }, { "epoch": 2.190578158458244, "grad_norm": 0.21657711095140936, "learning_rate": 1.5e-05, "loss": 0.2793, "step": 1023 }, { "epoch": 2.1927194860813706, "grad_norm": 0.2085788754213796, "learning_rate": 1.4960317460317461e-05, "loss": 0.2816, "step": 1024 }, { "epoch": 2.194860813704497, "grad_norm": 0.20682253217524985, "learning_rate": 1.4920634920634922e-05, "loss": 0.2861, "step": 1025 }, { "epoch": 2.1970021413276233, "grad_norm": 0.19353447072521687, "learning_rate": 1.4880952380952381e-05, "loss": 0.2818, "step": 1026 }, { "epoch": 2.1991434689507496, "grad_norm": 0.2023000282481551, "learning_rate": 1.4841269841269842e-05, "loss": 0.2897, "step": 1027 }, { "epoch": 2.201284796573876, "grad_norm": 0.2091382321658723, "learning_rate": 1.4801587301587302e-05, "loss": 0.2803, "step": 1028 }, { "epoch": 2.2034261241970023, "grad_norm": 0.19040214411798756, "learning_rate": 1.4761904761904763e-05, "loss": 0.2709, "step": 1029 }, { "epoch": 2.2055674518201283, "grad_norm": 0.18805950537913466, "learning_rate": 1.4722222222222224e-05, "loss": 0.2641, "step": 1030 }, { "epoch": 2.207708779443255, "grad_norm": 0.19809642692965637, "learning_rate": 1.4682539682539683e-05, "loss": 0.2886, "step": 1031 }, { "epoch": 2.209850107066381, "grad_norm": 0.1912784314191019, "learning_rate": 1.4642857142857144e-05, "loss": 0.2795, "step": 1032 }, { "epoch": 2.2119914346895073, "grad_norm": 0.1966882118912259, "learning_rate": 1.4603174603174605e-05, "loss": 0.2837, "step": 1033 }, { "epoch": 2.2141327623126337, "grad_norm": 0.19981226873116584, "learning_rate": 1.4563492063492065e-05, "loss": 0.2798, "step": 1034 }, { "epoch": 2.21627408993576, "grad_norm": 0.19609944493947298, "learning_rate": 1.4523809523809526e-05, "loss": 0.2851, "step": 1035 }, { "epoch": 2.2184154175588864, "grad_norm": 0.20172541461591478, "learning_rate": 1.4484126984126987e-05, "loss": 0.2806, "step": 1036 }, { "epoch": 2.2205567451820127, "grad_norm": 0.2184127818693799, "learning_rate": 1.4444444444444444e-05, "loss": 0.2788, "step": 1037 }, { "epoch": 2.222698072805139, "grad_norm": 0.1934456256040119, "learning_rate": 1.4404761904761905e-05, "loss": 0.2684, "step": 1038 }, { "epoch": 2.2248394004282654, "grad_norm": 0.2082433040162603, "learning_rate": 1.4365079365079364e-05, "loss": 0.2923, "step": 1039 }, { "epoch": 2.226980728051392, "grad_norm": 0.22380158655105736, "learning_rate": 1.4325396825396825e-05, "loss": 0.2779, "step": 1040 }, { "epoch": 2.229122055674518, "grad_norm": 0.1942322520767476, "learning_rate": 1.4285714285714285e-05, "loss": 0.2759, "step": 1041 }, { "epoch": 2.2312633832976445, "grad_norm": 0.1974411304770536, "learning_rate": 1.4246031746031746e-05, "loss": 0.2688, "step": 1042 }, { "epoch": 2.233404710920771, "grad_norm": 0.19709284568411106, "learning_rate": 1.4206349206349207e-05, "loss": 0.2789, "step": 1043 }, { "epoch": 2.235546038543897, "grad_norm": 0.19313497813772162, "learning_rate": 1.4166666666666668e-05, "loss": 0.28, "step": 1044 }, { "epoch": 2.2376873661670236, "grad_norm": 0.19756395359956005, "learning_rate": 1.4126984126984127e-05, "loss": 0.287, "step": 1045 }, { "epoch": 2.23982869379015, "grad_norm": 0.19707117382557698, "learning_rate": 1.4087301587301587e-05, "loss": 0.2793, "step": 1046 }, { "epoch": 2.2419700214132763, "grad_norm": 0.2185295361546076, "learning_rate": 1.4047619047619048e-05, "loss": 0.2801, "step": 1047 }, { "epoch": 2.2441113490364026, "grad_norm": 0.1959782383553124, "learning_rate": 1.4007936507936509e-05, "loss": 0.2778, "step": 1048 }, { "epoch": 2.246252676659529, "grad_norm": 0.20220432730319396, "learning_rate": 1.396825396825397e-05, "loss": 0.2739, "step": 1049 }, { "epoch": 2.2483940042826553, "grad_norm": 0.203202312939082, "learning_rate": 1.392857142857143e-05, "loss": 0.2686, "step": 1050 }, { "epoch": 2.2505353319057817, "grad_norm": 0.19629196239471303, "learning_rate": 1.388888888888889e-05, "loss": 0.2741, "step": 1051 }, { "epoch": 2.252676659528908, "grad_norm": 0.20785913348172835, "learning_rate": 1.384920634920635e-05, "loss": 0.2738, "step": 1052 }, { "epoch": 2.2548179871520344, "grad_norm": 0.20245154360926115, "learning_rate": 1.3809523809523811e-05, "loss": 0.2944, "step": 1053 }, { "epoch": 2.2569593147751608, "grad_norm": 0.2096114189668644, "learning_rate": 1.3769841269841272e-05, "loss": 0.2759, "step": 1054 }, { "epoch": 2.259100642398287, "grad_norm": 0.24320117601366367, "learning_rate": 1.3730158730158733e-05, "loss": 0.271, "step": 1055 }, { "epoch": 2.2612419700214135, "grad_norm": 0.19424837338985246, "learning_rate": 1.3690476190476192e-05, "loss": 0.2793, "step": 1056 }, { "epoch": 2.2633832976445394, "grad_norm": 0.19235543786637513, "learning_rate": 1.365079365079365e-05, "loss": 0.2797, "step": 1057 }, { "epoch": 2.265524625267666, "grad_norm": 0.2014689883501026, "learning_rate": 1.3611111111111111e-05, "loss": 0.2804, "step": 1058 }, { "epoch": 2.267665952890792, "grad_norm": 0.19388371477246316, "learning_rate": 1.357142857142857e-05, "loss": 0.2857, "step": 1059 }, { "epoch": 2.2698072805139184, "grad_norm": 0.18600354706155486, "learning_rate": 1.3531746031746031e-05, "loss": 0.268, "step": 1060 }, { "epoch": 2.271948608137045, "grad_norm": 0.1997457077450587, "learning_rate": 1.3492063492063492e-05, "loss": 0.2744, "step": 1061 }, { "epoch": 2.274089935760171, "grad_norm": 0.21254455192384775, "learning_rate": 1.3452380952380953e-05, "loss": 0.281, "step": 1062 }, { "epoch": 2.2762312633832975, "grad_norm": 0.19277682702608626, "learning_rate": 1.3412698412698413e-05, "loss": 0.2766, "step": 1063 }, { "epoch": 2.278372591006424, "grad_norm": 0.19996995603399093, "learning_rate": 1.3373015873015873e-05, "loss": 0.2811, "step": 1064 }, { "epoch": 2.28051391862955, "grad_norm": 0.2037157346895961, "learning_rate": 1.3333333333333333e-05, "loss": 0.286, "step": 1065 }, { "epoch": 2.2826552462526766, "grad_norm": 0.2038456192268336, "learning_rate": 1.3293650793650794e-05, "loss": 0.2756, "step": 1066 }, { "epoch": 2.284796573875803, "grad_norm": 0.19897665841544188, "learning_rate": 1.3253968253968255e-05, "loss": 0.2726, "step": 1067 }, { "epoch": 2.2869379014989293, "grad_norm": 0.2001072986623387, "learning_rate": 1.3214285714285716e-05, "loss": 0.2795, "step": 1068 }, { "epoch": 2.2890792291220556, "grad_norm": 0.19517842971444538, "learning_rate": 1.3174603174603176e-05, "loss": 0.2635, "step": 1069 }, { "epoch": 2.291220556745182, "grad_norm": 0.2068829830909192, "learning_rate": 1.3134920634920635e-05, "loss": 0.2791, "step": 1070 }, { "epoch": 2.2933618843683083, "grad_norm": 0.21700568586294433, "learning_rate": 1.3095238095238096e-05, "loss": 0.3014, "step": 1071 }, { "epoch": 2.2955032119914347, "grad_norm": 0.19043215131446242, "learning_rate": 1.3055555555555557e-05, "loss": 0.2772, "step": 1072 }, { "epoch": 2.297644539614561, "grad_norm": 0.20431273180658213, "learning_rate": 1.3015873015873018e-05, "loss": 0.2738, "step": 1073 }, { "epoch": 2.2997858672376874, "grad_norm": 0.1971225707821823, "learning_rate": 1.2976190476190478e-05, "loss": 0.2762, "step": 1074 }, { "epoch": 2.3019271948608138, "grad_norm": 0.19556074151794256, "learning_rate": 1.2936507936507939e-05, "loss": 0.2572, "step": 1075 }, { "epoch": 2.30406852248394, "grad_norm": 0.20833957821982024, "learning_rate": 1.2896825396825398e-05, "loss": 0.2794, "step": 1076 }, { "epoch": 2.3062098501070665, "grad_norm": 0.2025740903256515, "learning_rate": 1.2857142857142857e-05, "loss": 0.2733, "step": 1077 }, { "epoch": 2.308351177730193, "grad_norm": 0.20532981633640338, "learning_rate": 1.2817460317460316e-05, "loss": 0.2801, "step": 1078 }, { "epoch": 2.310492505353319, "grad_norm": 0.20302693087098408, "learning_rate": 1.2777777777777777e-05, "loss": 0.2855, "step": 1079 }, { "epoch": 2.3126338329764455, "grad_norm": 0.19316097914518776, "learning_rate": 1.2738095238095238e-05, "loss": 0.2744, "step": 1080 }, { "epoch": 2.314775160599572, "grad_norm": 0.1890481893959948, "learning_rate": 1.2698412698412699e-05, "loss": 0.2858, "step": 1081 }, { "epoch": 2.3169164882226982, "grad_norm": 0.20709172274897492, "learning_rate": 1.265873015873016e-05, "loss": 0.287, "step": 1082 }, { "epoch": 2.3190578158458246, "grad_norm": 0.21179564472663326, "learning_rate": 1.261904761904762e-05, "loss": 0.2837, "step": 1083 }, { "epoch": 2.3211991434689505, "grad_norm": 0.2076511560969458, "learning_rate": 1.2579365079365079e-05, "loss": 0.2809, "step": 1084 }, { "epoch": 2.3233404710920773, "grad_norm": 0.2053185208412779, "learning_rate": 1.253968253968254e-05, "loss": 0.2778, "step": 1085 }, { "epoch": 2.325481798715203, "grad_norm": 0.19958273635866866, "learning_rate": 1.25e-05, "loss": 0.2637, "step": 1086 }, { "epoch": 2.3276231263383296, "grad_norm": 0.2089718737763175, "learning_rate": 1.2460317460317461e-05, "loss": 0.2871, "step": 1087 }, { "epoch": 2.329764453961456, "grad_norm": 0.2051818133160847, "learning_rate": 1.2420634920634922e-05, "loss": 0.2973, "step": 1088 }, { "epoch": 2.3319057815845823, "grad_norm": 0.6114154780985942, "learning_rate": 1.2380952380952381e-05, "loss": 0.2961, "step": 1089 }, { "epoch": 2.3340471092077086, "grad_norm": 0.1835556937496075, "learning_rate": 1.2341269841269842e-05, "loss": 0.269, "step": 1090 }, { "epoch": 2.336188436830835, "grad_norm": 0.20213777732489752, "learning_rate": 1.2301587301587301e-05, "loss": 0.2747, "step": 1091 }, { "epoch": 2.3383297644539613, "grad_norm": 0.21427750205781732, "learning_rate": 1.2261904761904762e-05, "loss": 0.2811, "step": 1092 }, { "epoch": 2.3404710920770877, "grad_norm": 0.2087757066721972, "learning_rate": 1.2222222222222222e-05, "loss": 0.2787, "step": 1093 }, { "epoch": 2.342612419700214, "grad_norm": 0.18757584192245175, "learning_rate": 1.2182539682539683e-05, "loss": 0.2889, "step": 1094 }, { "epoch": 2.3447537473233404, "grad_norm": 0.20608037222439926, "learning_rate": 1.2142857142857144e-05, "loss": 0.2789, "step": 1095 }, { "epoch": 2.3468950749464668, "grad_norm": 0.21935230178468215, "learning_rate": 1.2103174603174603e-05, "loss": 0.285, "step": 1096 }, { "epoch": 2.349036402569593, "grad_norm": 0.21133802393480924, "learning_rate": 1.2063492063492064e-05, "loss": 0.2899, "step": 1097 }, { "epoch": 2.3511777301927195, "grad_norm": 0.18791527322276846, "learning_rate": 1.2023809523809525e-05, "loss": 0.2742, "step": 1098 }, { "epoch": 2.353319057815846, "grad_norm": 0.1971862704976824, "learning_rate": 1.1984126984126985e-05, "loss": 0.2847, "step": 1099 }, { "epoch": 2.355460385438972, "grad_norm": 0.21189728780479716, "learning_rate": 1.1944444444444446e-05, "loss": 0.2872, "step": 1100 }, { "epoch": 2.3576017130620985, "grad_norm": 0.19227294980450937, "learning_rate": 1.1904761904761905e-05, "loss": 0.2907, "step": 1101 }, { "epoch": 2.359743040685225, "grad_norm": 0.20548532335329017, "learning_rate": 1.1865079365079366e-05, "loss": 0.2643, "step": 1102 }, { "epoch": 2.3618843683083512, "grad_norm": 0.18375693623718486, "learning_rate": 1.1825396825396825e-05, "loss": 0.2735, "step": 1103 }, { "epoch": 2.3640256959314776, "grad_norm": 0.21294408502746626, "learning_rate": 1.1785714285714286e-05, "loss": 0.281, "step": 1104 }, { "epoch": 2.366167023554604, "grad_norm": 0.19727873974409793, "learning_rate": 1.1746031746031746e-05, "loss": 0.281, "step": 1105 }, { "epoch": 2.3683083511777303, "grad_norm": 0.18895600615844696, "learning_rate": 1.1706349206349207e-05, "loss": 0.2852, "step": 1106 }, { "epoch": 2.3704496788008567, "grad_norm": 0.19703744794029157, "learning_rate": 1.1666666666666668e-05, "loss": 0.2856, "step": 1107 }, { "epoch": 2.372591006423983, "grad_norm": 0.20958076853632984, "learning_rate": 1.1626984126984129e-05, "loss": 0.279, "step": 1108 }, { "epoch": 2.3747323340471094, "grad_norm": 0.19332254165250848, "learning_rate": 1.1587301587301588e-05, "loss": 0.2894, "step": 1109 }, { "epoch": 2.3768736616702357, "grad_norm": 0.19661719515205164, "learning_rate": 1.1547619047619048e-05, "loss": 0.2702, "step": 1110 }, { "epoch": 2.3790149892933616, "grad_norm": 0.1907420302248947, "learning_rate": 1.1507936507936508e-05, "loss": 0.2771, "step": 1111 }, { "epoch": 2.3811563169164884, "grad_norm": 0.18023957300416488, "learning_rate": 1.1468253968253968e-05, "loss": 0.2717, "step": 1112 }, { "epoch": 2.3832976445396143, "grad_norm": 0.2072993037766396, "learning_rate": 1.1428571428571429e-05, "loss": 0.2933, "step": 1113 }, { "epoch": 2.385438972162741, "grad_norm": 0.1904556433015721, "learning_rate": 1.138888888888889e-05, "loss": 0.2777, "step": 1114 }, { "epoch": 2.387580299785867, "grad_norm": 0.190065817649398, "learning_rate": 1.1349206349206349e-05, "loss": 0.2737, "step": 1115 }, { "epoch": 2.3897216274089934, "grad_norm": 0.18734473596445086, "learning_rate": 1.130952380952381e-05, "loss": 0.2798, "step": 1116 }, { "epoch": 2.3918629550321198, "grad_norm": 0.20310161931125695, "learning_rate": 1.126984126984127e-05, "loss": 0.2862, "step": 1117 }, { "epoch": 2.394004282655246, "grad_norm": 0.19855963578116342, "learning_rate": 1.1230158730158731e-05, "loss": 0.2784, "step": 1118 }, { "epoch": 2.3961456102783725, "grad_norm": 0.189077523042222, "learning_rate": 1.1190476190476192e-05, "loss": 0.286, "step": 1119 }, { "epoch": 2.398286937901499, "grad_norm": 0.18894803290615364, "learning_rate": 1.1150793650793653e-05, "loss": 0.2779, "step": 1120 }, { "epoch": 2.400428265524625, "grad_norm": 0.2015116046355913, "learning_rate": 1.1111111111111112e-05, "loss": 0.2865, "step": 1121 }, { "epoch": 2.4025695931477515, "grad_norm": 0.19437606708029698, "learning_rate": 1.107142857142857e-05, "loss": 0.2933, "step": 1122 }, { "epoch": 2.404710920770878, "grad_norm": 0.18852414654829355, "learning_rate": 1.1031746031746031e-05, "loss": 0.284, "step": 1123 }, { "epoch": 2.4068522483940042, "grad_norm": 0.1879656318380588, "learning_rate": 1.0992063492063492e-05, "loss": 0.2874, "step": 1124 }, { "epoch": 2.4089935760171306, "grad_norm": 0.19791655089067275, "learning_rate": 1.0952380952380953e-05, "loss": 0.2786, "step": 1125 }, { "epoch": 2.411134903640257, "grad_norm": 0.20226850687509584, "learning_rate": 1.0912698412698414e-05, "loss": 0.2922, "step": 1126 }, { "epoch": 2.4132762312633833, "grad_norm": 0.18552400783172202, "learning_rate": 1.0873015873015874e-05, "loss": 0.2757, "step": 1127 }, { "epoch": 2.4154175588865097, "grad_norm": 0.1837825345997371, "learning_rate": 1.0833333333333334e-05, "loss": 0.2764, "step": 1128 }, { "epoch": 2.417558886509636, "grad_norm": 0.19528894541437605, "learning_rate": 1.0793650793650794e-05, "loss": 0.2742, "step": 1129 }, { "epoch": 2.4197002141327624, "grad_norm": 0.21464260349692135, "learning_rate": 1.0753968253968255e-05, "loss": 0.2873, "step": 1130 }, { "epoch": 2.4218415417558887, "grad_norm": 0.1945459121129588, "learning_rate": 1.0714285714285714e-05, "loss": 0.2821, "step": 1131 }, { "epoch": 2.423982869379015, "grad_norm": 0.18478419433751667, "learning_rate": 1.0674603174603175e-05, "loss": 0.2826, "step": 1132 }, { "epoch": 2.4261241970021414, "grad_norm": 0.20225910020937835, "learning_rate": 1.0634920634920636e-05, "loss": 0.2873, "step": 1133 }, { "epoch": 2.428265524625268, "grad_norm": 0.20292439805906615, "learning_rate": 1.0595238095238096e-05, "loss": 0.282, "step": 1134 }, { "epoch": 2.430406852248394, "grad_norm": 0.18473437227064216, "learning_rate": 1.0555555555555555e-05, "loss": 0.269, "step": 1135 }, { "epoch": 2.4325481798715205, "grad_norm": 0.211980884226004, "learning_rate": 1.0515873015873016e-05, "loss": 0.2882, "step": 1136 }, { "epoch": 2.434689507494647, "grad_norm": 0.22535605642238996, "learning_rate": 1.0476190476190477e-05, "loss": 0.265, "step": 1137 }, { "epoch": 2.436830835117773, "grad_norm": 0.1991542202837908, "learning_rate": 1.0436507936507938e-05, "loss": 0.2887, "step": 1138 }, { "epoch": 2.4389721627408996, "grad_norm": 0.2072336431629644, "learning_rate": 1.0396825396825398e-05, "loss": 0.2805, "step": 1139 }, { "epoch": 2.4411134903640255, "grad_norm": 0.20543914654797396, "learning_rate": 1.0357142857142859e-05, "loss": 0.2905, "step": 1140 }, { "epoch": 2.4432548179871523, "grad_norm": 0.20157112476161199, "learning_rate": 1.0317460317460318e-05, "loss": 0.2843, "step": 1141 }, { "epoch": 2.445396145610278, "grad_norm": 0.1916041907370502, "learning_rate": 1.0277777777777777e-05, "loss": 0.2916, "step": 1142 }, { "epoch": 2.4475374732334045, "grad_norm": 0.19261294432915077, "learning_rate": 1.0238095238095238e-05, "loss": 0.2766, "step": 1143 }, { "epoch": 2.449678800856531, "grad_norm": 0.21519445528802897, "learning_rate": 1.0198412698412699e-05, "loss": 0.2819, "step": 1144 }, { "epoch": 2.4518201284796572, "grad_norm": 0.19022925356343928, "learning_rate": 1.015873015873016e-05, "loss": 0.2633, "step": 1145 }, { "epoch": 2.4539614561027836, "grad_norm": 0.18842920918262623, "learning_rate": 1.011904761904762e-05, "loss": 0.2701, "step": 1146 }, { "epoch": 2.45610278372591, "grad_norm": 0.18644571326970072, "learning_rate": 1.007936507936508e-05, "loss": 0.2747, "step": 1147 }, { "epoch": 2.4582441113490363, "grad_norm": 0.20125355865381975, "learning_rate": 1.003968253968254e-05, "loss": 0.2627, "step": 1148 }, { "epoch": 2.4603854389721627, "grad_norm": 0.21999175240537394, "learning_rate": 1e-05, "loss": 0.2805, "step": 1149 }, { "epoch": 2.462526766595289, "grad_norm": 0.19787370820057412, "learning_rate": 9.96031746031746e-06, "loss": 0.2811, "step": 1150 }, { "epoch": 2.4646680942184154, "grad_norm": 0.18955007917593744, "learning_rate": 9.92063492063492e-06, "loss": 0.286, "step": 1151 }, { "epoch": 2.4668094218415417, "grad_norm": 0.20431736362786296, "learning_rate": 9.880952380952381e-06, "loss": 0.2657, "step": 1152 }, { "epoch": 2.468950749464668, "grad_norm": 0.19320732235445437, "learning_rate": 9.841269841269842e-06, "loss": 0.2791, "step": 1153 }, { "epoch": 2.4710920770877944, "grad_norm": 0.19183959802084546, "learning_rate": 9.801587301587301e-06, "loss": 0.2702, "step": 1154 }, { "epoch": 2.473233404710921, "grad_norm": 0.19805919853745266, "learning_rate": 9.761904761904762e-06, "loss": 0.2858, "step": 1155 }, { "epoch": 2.475374732334047, "grad_norm": 0.1907652749236508, "learning_rate": 9.722222222222223e-06, "loss": 0.294, "step": 1156 }, { "epoch": 2.4775160599571735, "grad_norm": 0.19770176710293932, "learning_rate": 9.682539682539683e-06, "loss": 0.2759, "step": 1157 }, { "epoch": 2.4796573875803, "grad_norm": 0.19222146084786232, "learning_rate": 9.642857142857144e-06, "loss": 0.2732, "step": 1158 }, { "epoch": 2.481798715203426, "grad_norm": 0.20359350613339589, "learning_rate": 9.603174603174605e-06, "loss": 0.2858, "step": 1159 }, { "epoch": 2.4839400428265526, "grad_norm": 0.19032755422213457, "learning_rate": 9.563492063492064e-06, "loss": 0.2742, "step": 1160 }, { "epoch": 2.486081370449679, "grad_norm": 0.19335063670418315, "learning_rate": 9.523809523809523e-06, "loss": 0.2859, "step": 1161 }, { "epoch": 2.4882226980728053, "grad_norm": 0.19897996740846366, "learning_rate": 9.484126984126984e-06, "loss": 0.2857, "step": 1162 }, { "epoch": 2.4903640256959316, "grad_norm": 0.1941351806336689, "learning_rate": 9.444444444444445e-06, "loss": 0.2784, "step": 1163 }, { "epoch": 2.492505353319058, "grad_norm": 0.19765019351850296, "learning_rate": 9.404761904761905e-06, "loss": 0.2761, "step": 1164 }, { "epoch": 2.4946466809421843, "grad_norm": 0.18516337216937107, "learning_rate": 9.365079365079366e-06, "loss": 0.2764, "step": 1165 }, { "epoch": 2.4967880085653107, "grad_norm": 0.18907608503624435, "learning_rate": 9.325396825396827e-06, "loss": 0.277, "step": 1166 }, { "epoch": 2.4989293361884366, "grad_norm": 0.1941807441287669, "learning_rate": 9.285714285714286e-06, "loss": 0.275, "step": 1167 }, { "epoch": 2.5010706638115634, "grad_norm": 0.18811473358774278, "learning_rate": 9.246031746031747e-06, "loss": 0.2825, "step": 1168 }, { "epoch": 2.5032119914346893, "grad_norm": 0.19866610507866173, "learning_rate": 9.206349206349207e-06, "loss": 0.2903, "step": 1169 }, { "epoch": 2.505353319057816, "grad_norm": 0.18733861692093362, "learning_rate": 9.166666666666666e-06, "loss": 0.2873, "step": 1170 }, { "epoch": 2.507494646680942, "grad_norm": 0.20268747557365815, "learning_rate": 9.126984126984127e-06, "loss": 0.2948, "step": 1171 }, { "epoch": 2.5096359743040684, "grad_norm": 0.1996977559423424, "learning_rate": 9.087301587301588e-06, "loss": 0.2713, "step": 1172 }, { "epoch": 2.5117773019271947, "grad_norm": 0.1996076333163632, "learning_rate": 9.047619047619047e-06, "loss": 0.2779, "step": 1173 }, { "epoch": 2.513918629550321, "grad_norm": 0.19011932221335692, "learning_rate": 9.007936507936508e-06, "loss": 0.2803, "step": 1174 }, { "epoch": 2.5160599571734474, "grad_norm": 0.18599656098112288, "learning_rate": 8.968253968253968e-06, "loss": 0.2791, "step": 1175 }, { "epoch": 2.518201284796574, "grad_norm": 0.19678433899725203, "learning_rate": 8.92857142857143e-06, "loss": 0.2822, "step": 1176 }, { "epoch": 2.5203426124197, "grad_norm": 0.1846108520466212, "learning_rate": 8.88888888888889e-06, "loss": 0.266, "step": 1177 }, { "epoch": 2.5224839400428265, "grad_norm": 0.18960292191925746, "learning_rate": 8.84920634920635e-06, "loss": 0.2662, "step": 1178 }, { "epoch": 2.524625267665953, "grad_norm": 0.2200193235205345, "learning_rate": 8.80952380952381e-06, "loss": 0.2854, "step": 1179 }, { "epoch": 2.526766595289079, "grad_norm": 0.18902634217465988, "learning_rate": 8.769841269841269e-06, "loss": 0.2687, "step": 1180 }, { "epoch": 2.5289079229122056, "grad_norm": 0.19098504833889798, "learning_rate": 8.73015873015873e-06, "loss": 0.2735, "step": 1181 }, { "epoch": 2.531049250535332, "grad_norm": 0.19175998158414528, "learning_rate": 8.69047619047619e-06, "loss": 0.2785, "step": 1182 }, { "epoch": 2.5331905781584583, "grad_norm": 0.20056232550718314, "learning_rate": 8.650793650793651e-06, "loss": 0.2941, "step": 1183 }, { "epoch": 2.5353319057815846, "grad_norm": 0.19066315286775598, "learning_rate": 8.611111111111112e-06, "loss": 0.2731, "step": 1184 }, { "epoch": 2.537473233404711, "grad_norm": 0.18331446647935762, "learning_rate": 8.571428571428573e-06, "loss": 0.2758, "step": 1185 }, { "epoch": 2.5396145610278373, "grad_norm": 0.184471748142892, "learning_rate": 8.531746031746032e-06, "loss": 0.2703, "step": 1186 }, { "epoch": 2.5417558886509637, "grad_norm": 0.18934407240554624, "learning_rate": 8.492063492063492e-06, "loss": 0.2917, "step": 1187 }, { "epoch": 2.54389721627409, "grad_norm": 0.1839570377909689, "learning_rate": 8.452380952380953e-06, "loss": 0.2733, "step": 1188 }, { "epoch": 2.5460385438972164, "grad_norm": 0.19505246832240355, "learning_rate": 8.412698412698414e-06, "loss": 0.2821, "step": 1189 }, { "epoch": 2.5481798715203428, "grad_norm": 0.1868170944308521, "learning_rate": 8.373015873015873e-06, "loss": 0.2756, "step": 1190 }, { "epoch": 2.550321199143469, "grad_norm": 0.1863338672371585, "learning_rate": 8.333333333333334e-06, "loss": 0.2779, "step": 1191 }, { "epoch": 2.552462526766595, "grad_norm": 0.18694030749510335, "learning_rate": 8.293650793650794e-06, "loss": 0.2623, "step": 1192 }, { "epoch": 2.554603854389722, "grad_norm": 0.18346739599692435, "learning_rate": 8.253968253968254e-06, "loss": 0.2692, "step": 1193 }, { "epoch": 2.5567451820128477, "grad_norm": 0.193938387145013, "learning_rate": 8.214285714285714e-06, "loss": 0.2762, "step": 1194 }, { "epoch": 2.5588865096359745, "grad_norm": 0.1968770084022562, "learning_rate": 8.174603174603175e-06, "loss": 0.2749, "step": 1195 }, { "epoch": 2.5610278372591004, "grad_norm": 0.2012872958431149, "learning_rate": 8.134920634920636e-06, "loss": 0.2841, "step": 1196 }, { "epoch": 2.5631691648822272, "grad_norm": 0.19648015971972235, "learning_rate": 8.095238095238097e-06, "loss": 0.2677, "step": 1197 }, { "epoch": 2.565310492505353, "grad_norm": 0.18810436672950626, "learning_rate": 8.055555555555557e-06, "loss": 0.2668, "step": 1198 }, { "epoch": 2.5674518201284795, "grad_norm": 0.19172953088721956, "learning_rate": 8.015873015873016e-06, "loss": 0.2754, "step": 1199 }, { "epoch": 2.569593147751606, "grad_norm": 0.2022573936763731, "learning_rate": 7.976190476190475e-06, "loss": 0.2878, "step": 1200 }, { "epoch": 2.571734475374732, "grad_norm": 0.20158630502809705, "learning_rate": 7.936507936507936e-06, "loss": 0.3009, "step": 1201 }, { "epoch": 2.5738758029978586, "grad_norm": 0.19444900388460162, "learning_rate": 7.896825396825397e-06, "loss": 0.2723, "step": 1202 }, { "epoch": 2.576017130620985, "grad_norm": 0.19393996035159938, "learning_rate": 7.857142857142858e-06, "loss": 0.2784, "step": 1203 }, { "epoch": 2.5781584582441113, "grad_norm": 0.20388456521131612, "learning_rate": 7.817460317460318e-06, "loss": 0.2972, "step": 1204 }, { "epoch": 2.5802997858672376, "grad_norm": 0.19046236974876576, "learning_rate": 7.777777777777777e-06, "loss": 0.2866, "step": 1205 }, { "epoch": 2.582441113490364, "grad_norm": 0.21198070199178753, "learning_rate": 7.738095238095238e-06, "loss": 0.2874, "step": 1206 }, { "epoch": 2.5845824411134903, "grad_norm": 0.20591590293289885, "learning_rate": 7.698412698412699e-06, "loss": 0.3017, "step": 1207 }, { "epoch": 2.5867237687366167, "grad_norm": 0.20766349326464448, "learning_rate": 7.65873015873016e-06, "loss": 0.2874, "step": 1208 }, { "epoch": 2.588865096359743, "grad_norm": 0.18365533590321947, "learning_rate": 7.6190476190476205e-06, "loss": 0.2761, "step": 1209 }, { "epoch": 2.5910064239828694, "grad_norm": 0.19032688283851512, "learning_rate": 7.5793650793650795e-06, "loss": 0.2708, "step": 1210 }, { "epoch": 2.5931477516059958, "grad_norm": 0.1980622035151324, "learning_rate": 7.5396825396825394e-06, "loss": 0.269, "step": 1211 }, { "epoch": 2.595289079229122, "grad_norm": 0.1983391081538549, "learning_rate": 7.5e-06, "loss": 0.2796, "step": 1212 }, { "epoch": 2.5974304068522485, "grad_norm": 0.18057379441303084, "learning_rate": 7.460317460317461e-06, "loss": 0.2746, "step": 1213 }, { "epoch": 2.599571734475375, "grad_norm": 0.19068010050148992, "learning_rate": 7.420634920634921e-06, "loss": 0.263, "step": 1214 }, { "epoch": 2.601713062098501, "grad_norm": 0.18991511897664065, "learning_rate": 7.380952380952382e-06, "loss": 0.2773, "step": 1215 }, { "epoch": 2.6038543897216275, "grad_norm": 0.19991447304073107, "learning_rate": 7.3412698412698415e-06, "loss": 0.2946, "step": 1216 }, { "epoch": 2.605995717344754, "grad_norm": 0.20077693323441242, "learning_rate": 7.301587301587302e-06, "loss": 0.2952, "step": 1217 }, { "epoch": 2.6081370449678802, "grad_norm": 0.1910164898967557, "learning_rate": 7.261904761904763e-06, "loss": 0.2871, "step": 1218 }, { "epoch": 2.6102783725910066, "grad_norm": 0.19760453195164532, "learning_rate": 7.222222222222222e-06, "loss": 0.276, "step": 1219 }, { "epoch": 2.612419700214133, "grad_norm": 0.19107969100185188, "learning_rate": 7.182539682539682e-06, "loss": 0.2857, "step": 1220 }, { "epoch": 2.614561027837259, "grad_norm": 0.18522026162274652, "learning_rate": 7.142857142857143e-06, "loss": 0.269, "step": 1221 }, { "epoch": 2.6167023554603857, "grad_norm": 0.1940614696572163, "learning_rate": 7.1031746031746035e-06, "loss": 0.2755, "step": 1222 }, { "epoch": 2.6188436830835116, "grad_norm": 0.19575778265780672, "learning_rate": 7.063492063492063e-06, "loss": 0.2807, "step": 1223 }, { "epoch": 2.6209850107066384, "grad_norm": 0.18786133539258287, "learning_rate": 7.023809523809524e-06, "loss": 0.2726, "step": 1224 }, { "epoch": 2.6231263383297643, "grad_norm": 0.18567364878354917, "learning_rate": 6.984126984126985e-06, "loss": 0.2737, "step": 1225 }, { "epoch": 2.6252676659528906, "grad_norm": 0.18207760252968563, "learning_rate": 6.944444444444445e-06, "loss": 0.2692, "step": 1226 }, { "epoch": 2.627408993576017, "grad_norm": 0.190042944624669, "learning_rate": 6.9047619047619055e-06, "loss": 0.2677, "step": 1227 }, { "epoch": 2.6295503211991433, "grad_norm": 0.18201431307145552, "learning_rate": 6.865079365079366e-06, "loss": 0.2741, "step": 1228 }, { "epoch": 2.6316916488222697, "grad_norm": 0.1865727217496232, "learning_rate": 6.825396825396825e-06, "loss": 0.2671, "step": 1229 }, { "epoch": 2.633832976445396, "grad_norm": 0.1801997882324314, "learning_rate": 6.785714285714285e-06, "loss": 0.2699, "step": 1230 }, { "epoch": 2.6359743040685224, "grad_norm": 0.19389188154337386, "learning_rate": 6.746031746031746e-06, "loss": 0.2761, "step": 1231 }, { "epoch": 2.6381156316916488, "grad_norm": 0.1876099016700767, "learning_rate": 6.706349206349207e-06, "loss": 0.2974, "step": 1232 }, { "epoch": 2.640256959314775, "grad_norm": 0.19121391315236033, "learning_rate": 6.666666666666667e-06, "loss": 0.2988, "step": 1233 }, { "epoch": 2.6423982869379015, "grad_norm": 0.19053822839952972, "learning_rate": 6.626984126984127e-06, "loss": 0.2797, "step": 1234 }, { "epoch": 2.644539614561028, "grad_norm": 0.18505965289729664, "learning_rate": 6.587301587301588e-06, "loss": 0.2666, "step": 1235 }, { "epoch": 2.646680942184154, "grad_norm": 0.1893938662916577, "learning_rate": 6.547619047619048e-06, "loss": 0.2719, "step": 1236 }, { "epoch": 2.6488222698072805, "grad_norm": 0.1938288347509404, "learning_rate": 6.507936507936509e-06, "loss": 0.2823, "step": 1237 }, { "epoch": 2.650963597430407, "grad_norm": 0.19032293804902783, "learning_rate": 6.4682539682539696e-06, "loss": 0.2687, "step": 1238 }, { "epoch": 2.6531049250535332, "grad_norm": 0.1799378115721631, "learning_rate": 6.428571428571429e-06, "loss": 0.2708, "step": 1239 }, { "epoch": 2.6552462526766596, "grad_norm": 0.19301414441267256, "learning_rate": 6.3888888888888885e-06, "loss": 0.2952, "step": 1240 }, { "epoch": 2.657387580299786, "grad_norm": 0.1849662307056679, "learning_rate": 6.349206349206349e-06, "loss": 0.2668, "step": 1241 }, { "epoch": 2.6595289079229123, "grad_norm": 0.18055776145465033, "learning_rate": 6.30952380952381e-06, "loss": 0.2688, "step": 1242 }, { "epoch": 2.6616702355460387, "grad_norm": 0.1886120175112293, "learning_rate": 6.26984126984127e-06, "loss": 0.2858, "step": 1243 }, { "epoch": 2.663811563169165, "grad_norm": 0.18653293808215643, "learning_rate": 6.230158730158731e-06, "loss": 0.2848, "step": 1244 }, { "epoch": 2.6659528907922914, "grad_norm": 0.1843692042836253, "learning_rate": 6.190476190476191e-06, "loss": 0.2688, "step": 1245 }, { "epoch": 2.6680942184154177, "grad_norm": 0.19256050930680374, "learning_rate": 6.1507936507936505e-06, "loss": 0.276, "step": 1246 }, { "epoch": 2.670235546038544, "grad_norm": 0.1854008621346679, "learning_rate": 6.111111111111111e-06, "loss": 0.265, "step": 1247 }, { "epoch": 2.67237687366167, "grad_norm": 0.19104484469553526, "learning_rate": 6.071428571428572e-06, "loss": 0.2758, "step": 1248 }, { "epoch": 2.674518201284797, "grad_norm": 0.18892651998233595, "learning_rate": 6.031746031746032e-06, "loss": 0.2929, "step": 1249 }, { "epoch": 2.6766595289079227, "grad_norm": 0.21181690679734574, "learning_rate": 5.992063492063493e-06, "loss": 0.2766, "step": 1250 }, { "epoch": 2.6788008565310495, "grad_norm": 0.2257805250104584, "learning_rate": 5.9523809523809525e-06, "loss": 0.2767, "step": 1251 }, { "epoch": 2.6809421841541754, "grad_norm": 0.18618064006808122, "learning_rate": 5.9126984126984124e-06, "loss": 0.2768, "step": 1252 }, { "epoch": 2.683083511777302, "grad_norm": 0.17853160960610476, "learning_rate": 5.873015873015873e-06, "loss": 0.2819, "step": 1253 }, { "epoch": 2.685224839400428, "grad_norm": 0.1853256859343023, "learning_rate": 5.833333333333334e-06, "loss": 0.285, "step": 1254 }, { "epoch": 2.6873661670235545, "grad_norm": 0.20073292717441923, "learning_rate": 5.793650793650794e-06, "loss": 0.2717, "step": 1255 }, { "epoch": 2.689507494646681, "grad_norm": 0.18795175480989068, "learning_rate": 5.753968253968254e-06, "loss": 0.2785, "step": 1256 }, { "epoch": 2.691648822269807, "grad_norm": 0.1776226753510265, "learning_rate": 5.7142857142857145e-06, "loss": 0.2713, "step": 1257 }, { "epoch": 2.6937901498929335, "grad_norm": 0.17299646249891704, "learning_rate": 5.674603174603174e-06, "loss": 0.2671, "step": 1258 }, { "epoch": 2.69593147751606, "grad_norm": 0.1740946943196013, "learning_rate": 5.634920634920635e-06, "loss": 0.2724, "step": 1259 }, { "epoch": 2.6980728051391862, "grad_norm": 0.17712448583628357, "learning_rate": 5.595238095238096e-06, "loss": 0.281, "step": 1260 }, { "epoch": 2.7002141327623126, "grad_norm": 0.18442023632973145, "learning_rate": 5.555555555555556e-06, "loss": 0.2691, "step": 1261 }, { "epoch": 2.702355460385439, "grad_norm": 0.1813575399048717, "learning_rate": 5.515873015873016e-06, "loss": 0.2823, "step": 1262 }, { "epoch": 2.7044967880085653, "grad_norm": 0.18738697864152037, "learning_rate": 5.4761904761904765e-06, "loss": 0.2791, "step": 1263 }, { "epoch": 2.7066381156316917, "grad_norm": 0.18736026207972067, "learning_rate": 5.436507936507937e-06, "loss": 0.2777, "step": 1264 }, { "epoch": 2.708779443254818, "grad_norm": 0.1769303559004464, "learning_rate": 5.396825396825397e-06, "loss": 0.2891, "step": 1265 }, { "epoch": 2.7109207708779444, "grad_norm": 0.18034406343578302, "learning_rate": 5.357142857142857e-06, "loss": 0.2749, "step": 1266 }, { "epoch": 2.7130620985010707, "grad_norm": 0.18273873357915627, "learning_rate": 5.317460317460318e-06, "loss": 0.2725, "step": 1267 }, { "epoch": 2.715203426124197, "grad_norm": 0.19359646334285074, "learning_rate": 5.277777777777778e-06, "loss": 0.2784, "step": 1268 }, { "epoch": 2.7173447537473234, "grad_norm": 0.18794519478477986, "learning_rate": 5.2380952380952384e-06, "loss": 0.275, "step": 1269 }, { "epoch": 2.71948608137045, "grad_norm": 0.18233700769386646, "learning_rate": 5.198412698412699e-06, "loss": 0.2625, "step": 1270 }, { "epoch": 2.721627408993576, "grad_norm": 0.1870952848168329, "learning_rate": 5.158730158730159e-06, "loss": 0.2792, "step": 1271 }, { "epoch": 2.7237687366167025, "grad_norm": 0.19371058024908325, "learning_rate": 5.119047619047619e-06, "loss": 0.2885, "step": 1272 }, { "epoch": 2.725910064239829, "grad_norm": 0.22178418486723533, "learning_rate": 5.07936507936508e-06, "loss": 0.2994, "step": 1273 }, { "epoch": 2.728051391862955, "grad_norm": 0.2033197772755529, "learning_rate": 5.03968253968254e-06, "loss": 0.2798, "step": 1274 }, { "epoch": 2.730192719486081, "grad_norm": 0.18764851342867586, "learning_rate": 5e-06, "loss": 0.2739, "step": 1275 }, { "epoch": 2.732334047109208, "grad_norm": 0.18851110490571485, "learning_rate": 4.96031746031746e-06, "loss": 0.2807, "step": 1276 }, { "epoch": 2.734475374732334, "grad_norm": 0.1860752013848117, "learning_rate": 4.920634920634921e-06, "loss": 0.2808, "step": 1277 }, { "epoch": 2.7366167023554606, "grad_norm": 0.1913864953828912, "learning_rate": 4.880952380952381e-06, "loss": 0.2857, "step": 1278 }, { "epoch": 2.7387580299785865, "grad_norm": 0.18956585829358527, "learning_rate": 4.841269841269842e-06, "loss": 0.2929, "step": 1279 }, { "epoch": 2.7408993576017133, "grad_norm": 0.18168087559276616, "learning_rate": 4.8015873015873025e-06, "loss": 0.2639, "step": 1280 }, { "epoch": 2.7430406852248392, "grad_norm": 0.18483118773286475, "learning_rate": 4.7619047619047615e-06, "loss": 0.2796, "step": 1281 }, { "epoch": 2.7451820128479656, "grad_norm": 0.18660137821522485, "learning_rate": 4.722222222222222e-06, "loss": 0.2767, "step": 1282 }, { "epoch": 2.747323340471092, "grad_norm": 0.1870410587872287, "learning_rate": 4.682539682539683e-06, "loss": 0.2608, "step": 1283 }, { "epoch": 2.7494646680942183, "grad_norm": 0.18934420087113804, "learning_rate": 4.642857142857143e-06, "loss": 0.2776, "step": 1284 }, { "epoch": 2.7516059957173447, "grad_norm": 0.1944710962243241, "learning_rate": 4.603174603174604e-06, "loss": 0.2819, "step": 1285 }, { "epoch": 2.753747323340471, "grad_norm": 0.18507420980354902, "learning_rate": 4.563492063492064e-06, "loss": 0.2675, "step": 1286 }, { "epoch": 2.7558886509635974, "grad_norm": 0.18671295615639139, "learning_rate": 4.5238095238095235e-06, "loss": 0.2919, "step": 1287 }, { "epoch": 2.7580299785867237, "grad_norm": 0.1820586621003967, "learning_rate": 4.484126984126984e-06, "loss": 0.276, "step": 1288 }, { "epoch": 2.76017130620985, "grad_norm": 0.18863107638734553, "learning_rate": 4.444444444444445e-06, "loss": 0.2812, "step": 1289 }, { "epoch": 2.7623126338329764, "grad_norm": 0.1886951420800492, "learning_rate": 4.404761904761905e-06, "loss": 0.2684, "step": 1290 }, { "epoch": 2.764453961456103, "grad_norm": 0.18624344207988158, "learning_rate": 4.365079365079365e-06, "loss": 0.28, "step": 1291 }, { "epoch": 2.766595289079229, "grad_norm": 0.18959188951160139, "learning_rate": 4.3253968253968256e-06, "loss": 0.2583, "step": 1292 }, { "epoch": 2.7687366167023555, "grad_norm": 0.18636470840156413, "learning_rate": 4.285714285714286e-06, "loss": 0.2816, "step": 1293 }, { "epoch": 2.770877944325482, "grad_norm": 0.18043083965620976, "learning_rate": 4.246031746031746e-06, "loss": 0.2754, "step": 1294 }, { "epoch": 2.773019271948608, "grad_norm": 0.18037368839844325, "learning_rate": 4.206349206349207e-06, "loss": 0.279, "step": 1295 }, { "epoch": 2.7751605995717346, "grad_norm": 0.1823817127914405, "learning_rate": 4.166666666666667e-06, "loss": 0.2854, "step": 1296 }, { "epoch": 2.777301927194861, "grad_norm": 0.18058730214933147, "learning_rate": 4.126984126984127e-06, "loss": 0.2734, "step": 1297 }, { "epoch": 2.7794432548179873, "grad_norm": 0.1879979042288386, "learning_rate": 4.0873015873015875e-06, "loss": 0.2717, "step": 1298 }, { "epoch": 2.7815845824411136, "grad_norm": 0.18341480124861972, "learning_rate": 4.047619047619048e-06, "loss": 0.2836, "step": 1299 }, { "epoch": 2.78372591006424, "grad_norm": 0.18324668375068373, "learning_rate": 4.007936507936508e-06, "loss": 0.286, "step": 1300 }, { "epoch": 2.7858672376873663, "grad_norm": 0.1965187754215459, "learning_rate": 3.968253968253968e-06, "loss": 0.2823, "step": 1301 }, { "epoch": 2.7880085653104922, "grad_norm": 0.18573889524253487, "learning_rate": 3.928571428571429e-06, "loss": 0.2751, "step": 1302 }, { "epoch": 2.790149892933619, "grad_norm": 0.19265696353956446, "learning_rate": 3.888888888888889e-06, "loss": 0.2699, "step": 1303 }, { "epoch": 2.792291220556745, "grad_norm": 0.19781851093500513, "learning_rate": 3.8492063492063495e-06, "loss": 0.2779, "step": 1304 }, { "epoch": 2.7944325481798717, "grad_norm": 0.18360741758477603, "learning_rate": 3.8095238095238102e-06, "loss": 0.2787, "step": 1305 }, { "epoch": 2.7965738758029977, "grad_norm": 0.1862707572751917, "learning_rate": 3.7698412698412697e-06, "loss": 0.2774, "step": 1306 }, { "epoch": 2.7987152034261245, "grad_norm": 0.17847216017441006, "learning_rate": 3.7301587301587305e-06, "loss": 0.2669, "step": 1307 }, { "epoch": 2.8008565310492504, "grad_norm": 0.1743513654767412, "learning_rate": 3.690476190476191e-06, "loss": 0.2616, "step": 1308 }, { "epoch": 2.8029978586723767, "grad_norm": 0.17678834999880497, "learning_rate": 3.650793650793651e-06, "loss": 0.2729, "step": 1309 }, { "epoch": 2.805139186295503, "grad_norm": 0.18147041110133913, "learning_rate": 3.611111111111111e-06, "loss": 0.265, "step": 1310 }, { "epoch": 2.8072805139186294, "grad_norm": 0.19719201740239473, "learning_rate": 3.5714285714285714e-06, "loss": 0.2923, "step": 1311 }, { "epoch": 2.809421841541756, "grad_norm": 0.1876418916737588, "learning_rate": 3.5317460317460317e-06, "loss": 0.2909, "step": 1312 }, { "epoch": 2.811563169164882, "grad_norm": 0.18222086562470918, "learning_rate": 3.4920634920634924e-06, "loss": 0.2643, "step": 1313 }, { "epoch": 2.8137044967880085, "grad_norm": 0.18368483933597352, "learning_rate": 3.4523809523809528e-06, "loss": 0.2758, "step": 1314 }, { "epoch": 2.815845824411135, "grad_norm": 0.1837715941113332, "learning_rate": 3.4126984126984127e-06, "loss": 0.2787, "step": 1315 }, { "epoch": 2.817987152034261, "grad_norm": 0.19049420307445103, "learning_rate": 3.373015873015873e-06, "loss": 0.2847, "step": 1316 }, { "epoch": 2.8201284796573876, "grad_norm": 0.17835342017317368, "learning_rate": 3.3333333333333333e-06, "loss": 0.2786, "step": 1317 }, { "epoch": 2.822269807280514, "grad_norm": 0.17787960285957102, "learning_rate": 3.293650793650794e-06, "loss": 0.2727, "step": 1318 }, { "epoch": 2.8244111349036403, "grad_norm": 0.17862229300209337, "learning_rate": 3.2539682539682544e-06, "loss": 0.2695, "step": 1319 }, { "epoch": 2.8265524625267666, "grad_norm": 0.1901757951555972, "learning_rate": 3.2142857142857143e-06, "loss": 0.2575, "step": 1320 }, { "epoch": 2.828693790149893, "grad_norm": 0.18103134737351187, "learning_rate": 3.1746031746031746e-06, "loss": 0.2746, "step": 1321 }, { "epoch": 2.8308351177730193, "grad_norm": 0.18702283268180547, "learning_rate": 3.134920634920635e-06, "loss": 0.2861, "step": 1322 }, { "epoch": 2.8329764453961457, "grad_norm": 0.17858016179861205, "learning_rate": 3.0952380952380953e-06, "loss": 0.2767, "step": 1323 }, { "epoch": 2.835117773019272, "grad_norm": 0.18620061639621535, "learning_rate": 3.0555555555555556e-06, "loss": 0.2815, "step": 1324 }, { "epoch": 2.8372591006423984, "grad_norm": 0.1814788975276678, "learning_rate": 3.015873015873016e-06, "loss": 0.2767, "step": 1325 }, { "epoch": 2.8394004282655247, "grad_norm": 0.18778455877330474, "learning_rate": 2.9761904761904763e-06, "loss": 0.2928, "step": 1326 }, { "epoch": 2.841541755888651, "grad_norm": 0.17098509883295968, "learning_rate": 2.9365079365079366e-06, "loss": 0.2648, "step": 1327 }, { "epoch": 2.8436830835117775, "grad_norm": 0.17520171745909632, "learning_rate": 2.896825396825397e-06, "loss": 0.2724, "step": 1328 }, { "epoch": 2.8458244111349034, "grad_norm": 0.17536544537693816, "learning_rate": 2.8571428571428573e-06, "loss": 0.2783, "step": 1329 }, { "epoch": 2.84796573875803, "grad_norm": 0.17658589120185325, "learning_rate": 2.8174603174603176e-06, "loss": 0.2758, "step": 1330 }, { "epoch": 2.850107066381156, "grad_norm": 0.17567307855848976, "learning_rate": 2.777777777777778e-06, "loss": 0.2668, "step": 1331 }, { "epoch": 2.852248394004283, "grad_norm": 0.1806128444041324, "learning_rate": 2.7380952380952382e-06, "loss": 0.2835, "step": 1332 }, { "epoch": 2.854389721627409, "grad_norm": 0.18946086509767565, "learning_rate": 2.6984126984126986e-06, "loss": 0.271, "step": 1333 }, { "epoch": 2.8565310492505356, "grad_norm": 0.17379093474956162, "learning_rate": 2.658730158730159e-06, "loss": 0.2673, "step": 1334 }, { "epoch": 2.8586723768736615, "grad_norm": 0.18610236327347768, "learning_rate": 2.6190476190476192e-06, "loss": 0.2635, "step": 1335 }, { "epoch": 2.860813704496788, "grad_norm": 0.1773051015057536, "learning_rate": 2.5793650793650795e-06, "loss": 0.2814, "step": 1336 }, { "epoch": 2.862955032119914, "grad_norm": 0.16809841728125763, "learning_rate": 2.53968253968254e-06, "loss": 0.267, "step": 1337 }, { "epoch": 2.8650963597430406, "grad_norm": 0.1737833908046207, "learning_rate": 2.5e-06, "loss": 0.2701, "step": 1338 }, { "epoch": 2.867237687366167, "grad_norm": 0.1748499028349341, "learning_rate": 2.4603174603174605e-06, "loss": 0.2723, "step": 1339 }, { "epoch": 2.8693790149892933, "grad_norm": 0.182639754025226, "learning_rate": 2.420634920634921e-06, "loss": 0.2914, "step": 1340 }, { "epoch": 2.8715203426124196, "grad_norm": 0.17803450959486153, "learning_rate": 2.3809523809523808e-06, "loss": 0.2689, "step": 1341 }, { "epoch": 2.873661670235546, "grad_norm": 0.1812081036704342, "learning_rate": 2.3412698412698415e-06, "loss": 0.2725, "step": 1342 }, { "epoch": 2.8758029978586723, "grad_norm": 0.17858190828209497, "learning_rate": 2.301587301587302e-06, "loss": 0.2809, "step": 1343 }, { "epoch": 2.8779443254817987, "grad_norm": 0.17897541722080942, "learning_rate": 2.2619047619047617e-06, "loss": 0.2827, "step": 1344 }, { "epoch": 2.880085653104925, "grad_norm": 0.18620447467008244, "learning_rate": 2.2222222222222225e-06, "loss": 0.2739, "step": 1345 }, { "epoch": 2.8822269807280514, "grad_norm": 0.17922254926198172, "learning_rate": 2.1825396825396824e-06, "loss": 0.2937, "step": 1346 }, { "epoch": 2.8843683083511777, "grad_norm": 0.1733438564135853, "learning_rate": 2.142857142857143e-06, "loss": 0.2648, "step": 1347 }, { "epoch": 2.886509635974304, "grad_norm": 0.18332268371523366, "learning_rate": 2.1031746031746035e-06, "loss": 0.2678, "step": 1348 }, { "epoch": 2.8886509635974305, "grad_norm": 0.18120143996921909, "learning_rate": 2.0634920634920634e-06, "loss": 0.2703, "step": 1349 }, { "epoch": 2.890792291220557, "grad_norm": 0.17603741176248058, "learning_rate": 2.023809523809524e-06, "loss": 0.2634, "step": 1350 }, { "epoch": 2.892933618843683, "grad_norm": 0.18073214958209138, "learning_rate": 1.984126984126984e-06, "loss": 0.2776, "step": 1351 }, { "epoch": 2.8950749464668095, "grad_norm": 0.1765653541500017, "learning_rate": 1.9444444444444444e-06, "loss": 0.27, "step": 1352 }, { "epoch": 2.897216274089936, "grad_norm": 0.17852720128484673, "learning_rate": 1.9047619047619051e-06, "loss": 0.2731, "step": 1353 }, { "epoch": 2.8993576017130622, "grad_norm": 0.17920803523071938, "learning_rate": 1.8650793650793652e-06, "loss": 0.2803, "step": 1354 }, { "epoch": 2.9014989293361886, "grad_norm": 0.17112784325767613, "learning_rate": 1.8253968253968256e-06, "loss": 0.2753, "step": 1355 }, { "epoch": 2.903640256959315, "grad_norm": 0.18339378168166692, "learning_rate": 1.7857142857142857e-06, "loss": 0.273, "step": 1356 }, { "epoch": 2.9057815845824413, "grad_norm": 0.1984295346113488, "learning_rate": 1.7460317460317462e-06, "loss": 0.2796, "step": 1357 }, { "epoch": 2.907922912205567, "grad_norm": 0.18306750932125437, "learning_rate": 1.7063492063492063e-06, "loss": 0.2844, "step": 1358 }, { "epoch": 2.910064239828694, "grad_norm": 0.18197706558527013, "learning_rate": 1.6666666666666667e-06, "loss": 0.2685, "step": 1359 }, { "epoch": 2.91220556745182, "grad_norm": 0.17423685740773412, "learning_rate": 1.6269841269841272e-06, "loss": 0.2614, "step": 1360 }, { "epoch": 2.9143468950749467, "grad_norm": 0.17655186248410631, "learning_rate": 1.5873015873015873e-06, "loss": 0.27, "step": 1361 }, { "epoch": 2.9164882226980726, "grad_norm": 0.17672011279618857, "learning_rate": 1.5476190476190476e-06, "loss": 0.2768, "step": 1362 }, { "epoch": 2.9186295503211994, "grad_norm": 0.184344574950677, "learning_rate": 1.507936507936508e-06, "loss": 0.3001, "step": 1363 }, { "epoch": 2.9207708779443253, "grad_norm": 0.17385175676196593, "learning_rate": 1.4682539682539683e-06, "loss": 0.2687, "step": 1364 }, { "epoch": 2.9229122055674517, "grad_norm": 0.17861379692559234, "learning_rate": 1.4285714285714286e-06, "loss": 0.2719, "step": 1365 }, { "epoch": 2.925053533190578, "grad_norm": 0.177882396776031, "learning_rate": 1.388888888888889e-06, "loss": 0.2772, "step": 1366 }, { "epoch": 2.9271948608137044, "grad_norm": 0.16826587147555336, "learning_rate": 1.3492063492063493e-06, "loss": 0.2701, "step": 1367 }, { "epoch": 2.9293361884368307, "grad_norm": 0.18519355648741595, "learning_rate": 1.3095238095238096e-06, "loss": 0.2992, "step": 1368 }, { "epoch": 2.931477516059957, "grad_norm": 0.1782680704296424, "learning_rate": 1.26984126984127e-06, "loss": 0.2801, "step": 1369 }, { "epoch": 2.9336188436830835, "grad_norm": 0.17474930461615157, "learning_rate": 1.2301587301587303e-06, "loss": 0.2766, "step": 1370 }, { "epoch": 2.93576017130621, "grad_norm": 0.18245581683863532, "learning_rate": 1.1904761904761904e-06, "loss": 0.2662, "step": 1371 }, { "epoch": 2.937901498929336, "grad_norm": 0.17789694691861707, "learning_rate": 1.150793650793651e-06, "loss": 0.2885, "step": 1372 }, { "epoch": 2.9400428265524625, "grad_norm": 0.1687387330562206, "learning_rate": 1.1111111111111112e-06, "loss": 0.2537, "step": 1373 }, { "epoch": 2.942184154175589, "grad_norm": 0.18142938523732535, "learning_rate": 1.0714285714285716e-06, "loss": 0.2695, "step": 1374 }, { "epoch": 2.9443254817987152, "grad_norm": 0.16915728914832637, "learning_rate": 1.0317460317460317e-06, "loss": 0.2641, "step": 1375 }, { "epoch": 2.9464668094218416, "grad_norm": 0.1780681979316246, "learning_rate": 9.92063492063492e-07, "loss": 0.2693, "step": 1376 }, { "epoch": 2.948608137044968, "grad_norm": 0.178537434553442, "learning_rate": 9.523809523809526e-07, "loss": 0.2635, "step": 1377 }, { "epoch": 2.9507494646680943, "grad_norm": 0.17999898864703598, "learning_rate": 9.126984126984128e-07, "loss": 0.2797, "step": 1378 }, { "epoch": 2.9528907922912206, "grad_norm": 0.18157617617292318, "learning_rate": 8.730158730158731e-07, "loss": 0.2778, "step": 1379 }, { "epoch": 2.955032119914347, "grad_norm": 0.180109121804787, "learning_rate": 8.333333333333333e-07, "loss": 0.2822, "step": 1380 }, { "epoch": 2.9571734475374734, "grad_norm": 0.17585603613458994, "learning_rate": 7.936507936507937e-07, "loss": 0.2796, "step": 1381 }, { "epoch": 2.9593147751605997, "grad_norm": 0.178180131081556, "learning_rate": 7.53968253968254e-07, "loss": 0.288, "step": 1382 }, { "epoch": 2.961456102783726, "grad_norm": 0.174756761248787, "learning_rate": 7.142857142857143e-07, "loss": 0.2963, "step": 1383 }, { "epoch": 2.9635974304068524, "grad_norm": 0.17507554933400857, "learning_rate": 6.746031746031746e-07, "loss": 0.2743, "step": 1384 }, { "epoch": 2.9657387580299783, "grad_norm": 0.17237340079111105, "learning_rate": 6.34920634920635e-07, "loss": 0.2812, "step": 1385 }, { "epoch": 2.967880085653105, "grad_norm": 0.17800514856704938, "learning_rate": 5.952380952380952e-07, "loss": 0.2901, "step": 1386 }, { "epoch": 2.970021413276231, "grad_norm": 0.17518183885385935, "learning_rate": 5.555555555555556e-07, "loss": 0.2817, "step": 1387 }, { "epoch": 2.972162740899358, "grad_norm": 0.19209259654412175, "learning_rate": 5.158730158730158e-07, "loss": 0.2979, "step": 1388 }, { "epoch": 2.9743040685224837, "grad_norm": 0.18038864732034635, "learning_rate": 4.761904761904763e-07, "loss": 0.2758, "step": 1389 }, { "epoch": 2.9764453961456105, "grad_norm": 0.17455696050193842, "learning_rate": 4.3650793650793655e-07, "loss": 0.2783, "step": 1390 }, { "epoch": 2.9785867237687365, "grad_norm": 0.17453553372749056, "learning_rate": 3.9682539682539683e-07, "loss": 0.2662, "step": 1391 }, { "epoch": 2.980728051391863, "grad_norm": 0.17577598361771224, "learning_rate": 3.5714285714285716e-07, "loss": 0.2856, "step": 1392 }, { "epoch": 2.982869379014989, "grad_norm": 0.16869697010362938, "learning_rate": 3.174603174603175e-07, "loss": 0.2733, "step": 1393 }, { "epoch": 2.9850107066381155, "grad_norm": 0.1776277204829826, "learning_rate": 2.777777777777778e-07, "loss": 0.2734, "step": 1394 }, { "epoch": 2.987152034261242, "grad_norm": 0.17086305942740904, "learning_rate": 2.3809523809523814e-07, "loss": 0.2697, "step": 1395 }, { "epoch": 2.9892933618843682, "grad_norm": 0.18182256735600724, "learning_rate": 1.9841269841269841e-07, "loss": 0.2862, "step": 1396 }, { "epoch": 2.9914346895074946, "grad_norm": 0.1740762738414256, "learning_rate": 1.5873015873015874e-07, "loss": 0.2888, "step": 1397 }, { "epoch": 2.993576017130621, "grad_norm": 0.1737350102448509, "learning_rate": 1.1904761904761907e-07, "loss": 0.2777, "step": 1398 }, { "epoch": 2.9957173447537473, "grad_norm": 0.16946025736659603, "learning_rate": 7.936507936507937e-08, "loss": 0.2651, "step": 1399 }, { "epoch": 2.9978586723768736, "grad_norm": 0.1722572338000261, "learning_rate": 3.9682539682539686e-08, "loss": 0.261, "step": 1400 }, { "epoch": 3.0, "grad_norm": 0.17642900461088157, "learning_rate": 0.0, "loss": 0.2644, "step": 1401 }, { "epoch": 3.0, "step": 1401, "total_flos": 1.5578375880118895e+19, "train_loss": 0.4592563231913725, "train_runtime": 43286.824, "train_samples_per_second": 0.518, "train_steps_per_second": 0.032 } ], "logging_steps": 1, "max_steps": 1401, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5578375880118895e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }