{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 5743,
"global_step": 17229,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 90.67318725585938,
"learning_rate": 2.901915264074289e-09,
"loss": 4.6958,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 104.22982025146484,
"learning_rate": 9.286128845037725e-08,
"loss": 5.0471,
"step": 32
},
{
"epoch": 0.01,
"grad_norm": 29.807086944580078,
"learning_rate": 1.857225769007545e-07,
"loss": 4.7906,
"step": 64
},
{
"epoch": 0.02,
"grad_norm": 25.591365814208984,
"learning_rate": 2.785838653511318e-07,
"loss": 4.3938,
"step": 96
},
{
"epoch": 0.02,
"grad_norm": 25.191415786743164,
"learning_rate": 3.71445153801509e-07,
"loss": 4.2526,
"step": 128
},
{
"epoch": 0.03,
"grad_norm": 34.85346984863281,
"learning_rate": 4.643064422518863e-07,
"loss": 4.1321,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 27.427072525024414,
"learning_rate": 5.571677307022636e-07,
"loss": 3.8133,
"step": 192
},
{
"epoch": 0.04,
"grad_norm": 21.698957443237305,
"learning_rate": 6.500290191526408e-07,
"loss": 3.8467,
"step": 224
},
{
"epoch": 0.04,
"grad_norm": 44.60033416748047,
"learning_rate": 7.42890307603018e-07,
"loss": 3.7419,
"step": 256
},
{
"epoch": 0.05,
"grad_norm": 49.22486877441406,
"learning_rate": 8.357515960533953e-07,
"loss": 3.6156,
"step": 288
},
{
"epoch": 0.06,
"grad_norm": 36.528568267822266,
"learning_rate": 9.286128845037726e-07,
"loss": 3.3479,
"step": 320
},
{
"epoch": 0.06,
"grad_norm": 48.99635696411133,
"learning_rate": 1.0214741729541498e-06,
"loss": 3.0823,
"step": 352
},
{
"epoch": 0.07,
"grad_norm": 44.24087142944336,
"learning_rate": 1.1143354614045271e-06,
"loss": 2.7423,
"step": 384
},
{
"epoch": 0.07,
"grad_norm": 50.86935806274414,
"learning_rate": 1.2071967498549044e-06,
"loss": 2.4941,
"step": 416
},
{
"epoch": 0.08,
"grad_norm": 31.083118438720703,
"learning_rate": 1.3000580383052816e-06,
"loss": 2.2775,
"step": 448
},
{
"epoch": 0.08,
"grad_norm": 35.24567413330078,
"learning_rate": 1.3929193267556587e-06,
"loss": 1.9606,
"step": 480
},
{
"epoch": 0.09,
"grad_norm": 53.384952545166016,
"learning_rate": 1.485780615206036e-06,
"loss": 1.6271,
"step": 512
},
{
"epoch": 0.09,
"grad_norm": 47.57023239135742,
"learning_rate": 1.5786419036564131e-06,
"loss": 1.3614,
"step": 544
},
{
"epoch": 0.1,
"grad_norm": 38.1913948059082,
"learning_rate": 1.6715031921067907e-06,
"loss": 1.007,
"step": 576
},
{
"epoch": 0.11,
"grad_norm": 24.733055114746094,
"learning_rate": 1.7643644805571678e-06,
"loss": 0.7387,
"step": 608
},
{
"epoch": 0.11,
"grad_norm": 20.36598777770996,
"learning_rate": 1.8572257690075451e-06,
"loss": 0.5821,
"step": 640
},
{
"epoch": 0.12,
"grad_norm": 9.40074634552002,
"learning_rate": 1.9500870574579222e-06,
"loss": 0.4921,
"step": 672
},
{
"epoch": 0.12,
"grad_norm": 9.734894752502441,
"learning_rate": 2.0429483459082996e-06,
"loss": 0.4433,
"step": 704
},
{
"epoch": 0.13,
"grad_norm": 8.26183032989502,
"learning_rate": 2.135809634358677e-06,
"loss": 0.4163,
"step": 736
},
{
"epoch": 0.13,
"grad_norm": 4.765627861022949,
"learning_rate": 2.2286709228090542e-06,
"loss": 0.403,
"step": 768
},
{
"epoch": 0.14,
"grad_norm": 3.9424140453338623,
"learning_rate": 2.321532211259431e-06,
"loss": 0.4155,
"step": 800
},
{
"epoch": 0.14,
"grad_norm": 3.6798696517944336,
"learning_rate": 2.414393499709809e-06,
"loss": 0.3943,
"step": 832
},
{
"epoch": 0.15,
"grad_norm": 3.714017868041992,
"learning_rate": 2.5072547881601862e-06,
"loss": 0.3845,
"step": 864
},
{
"epoch": 0.16,
"grad_norm": 3.57197642326355,
"learning_rate": 2.600116076610563e-06,
"loss": 0.3699,
"step": 896
},
{
"epoch": 0.16,
"grad_norm": 3.1965994834899902,
"learning_rate": 2.6929773650609405e-06,
"loss": 0.3559,
"step": 928
},
{
"epoch": 0.17,
"grad_norm": 2.980022668838501,
"learning_rate": 2.7858386535113174e-06,
"loss": 0.347,
"step": 960
},
{
"epoch": 0.17,
"grad_norm": 3.079709529876709,
"learning_rate": 2.878699941961695e-06,
"loss": 0.3415,
"step": 992
},
{
"epoch": 0.18,
"grad_norm": 2.8273375034332275,
"learning_rate": 2.971561230412072e-06,
"loss": 0.3531,
"step": 1024
},
{
"epoch": 0.18,
"grad_norm": 2.214918375015259,
"learning_rate": 3.0644225188624494e-06,
"loss": 0.3414,
"step": 1056
},
{
"epoch": 0.19,
"grad_norm": 2.4280753135681152,
"learning_rate": 3.1572838073128263e-06,
"loss": 0.3391,
"step": 1088
},
{
"epoch": 0.2,
"grad_norm": 1.955213189125061,
"learning_rate": 3.250145095763204e-06,
"loss": 0.3372,
"step": 1120
},
{
"epoch": 0.2,
"grad_norm": 2.2462759017944336,
"learning_rate": 3.3430063842135814e-06,
"loss": 0.3459,
"step": 1152
},
{
"epoch": 0.21,
"grad_norm": 2.0212900638580322,
"learning_rate": 3.4358676726639583e-06,
"loss": 0.3429,
"step": 1184
},
{
"epoch": 0.21,
"grad_norm": 1.7310266494750977,
"learning_rate": 3.5287289611143356e-06,
"loss": 0.3347,
"step": 1216
},
{
"epoch": 0.22,
"grad_norm": 2.134054183959961,
"learning_rate": 3.6215902495647133e-06,
"loss": 0.3187,
"step": 1248
},
{
"epoch": 0.22,
"grad_norm": 1.850546956062317,
"learning_rate": 3.7144515380150902e-06,
"loss": 0.3217,
"step": 1280
},
{
"epoch": 0.23,
"grad_norm": 2.3363544940948486,
"learning_rate": 3.8073128264654676e-06,
"loss": 0.3398,
"step": 1312
},
{
"epoch": 0.23,
"grad_norm": 1.707647681236267,
"learning_rate": 3.9001741149158445e-06,
"loss": 0.3132,
"step": 1344
},
{
"epoch": 0.24,
"grad_norm": 1.7750575542449951,
"learning_rate": 3.993035403366222e-06,
"loss": 0.315,
"step": 1376
},
{
"epoch": 0.25,
"grad_norm": 1.8037201166152954,
"learning_rate": 4.085896691816599e-06,
"loss": 0.3263,
"step": 1408
},
{
"epoch": 0.25,
"grad_norm": 2.307892084121704,
"learning_rate": 4.1787579802669765e-06,
"loss": 0.3119,
"step": 1440
},
{
"epoch": 0.26,
"grad_norm": 2.105405569076538,
"learning_rate": 4.271619268717354e-06,
"loss": 0.3253,
"step": 1472
},
{
"epoch": 0.26,
"grad_norm": 1.809043526649475,
"learning_rate": 4.364480557167731e-06,
"loss": 0.3293,
"step": 1504
},
{
"epoch": 0.27,
"grad_norm": 2.1056556701660156,
"learning_rate": 4.4573418456181085e-06,
"loss": 0.3091,
"step": 1536
},
{
"epoch": 0.27,
"grad_norm": 1.6170997619628906,
"learning_rate": 4.550203134068486e-06,
"loss": 0.3123,
"step": 1568
},
{
"epoch": 0.28,
"grad_norm": 1.9799582958221436,
"learning_rate": 4.643064422518862e-06,
"loss": 0.3225,
"step": 1600
},
{
"epoch": 0.28,
"grad_norm": 1.9572292566299438,
"learning_rate": 4.7359257109692405e-06,
"loss": 0.3257,
"step": 1632
},
{
"epoch": 0.29,
"grad_norm": 1.9340535402297974,
"learning_rate": 4.828786999419618e-06,
"loss": 0.3176,
"step": 1664
},
{
"epoch": 0.3,
"grad_norm": 1.9439414739608765,
"learning_rate": 4.921648287869994e-06,
"loss": 0.3114,
"step": 1696
},
{
"epoch": 0.3,
"grad_norm": 1.8130865097045898,
"learning_rate": 4.999998717225936e-06,
"loss": 0.3136,
"step": 1728
},
{
"epoch": 0.31,
"grad_norm": 1.8154480457305908,
"learning_rate": 4.999929755615174e-06,
"loss": 0.3215,
"step": 1760
},
{
"epoch": 0.31,
"grad_norm": 1.8465503454208374,
"learning_rate": 4.999755712464791e-06,
"loss": 0.3178,
"step": 1792
},
{
"epoch": 0.32,
"grad_norm": 1.7973068952560425,
"learning_rate": 4.999476595090482e-06,
"loss": 0.3196,
"step": 1824
},
{
"epoch": 0.32,
"grad_norm": 1.5011913776397705,
"learning_rate": 4.99909241522461e-06,
"loss": 0.3194,
"step": 1856
},
{
"epoch": 0.33,
"grad_norm": 1.8762191534042358,
"learning_rate": 4.998603189015714e-06,
"loss": 0.3238,
"step": 1888
},
{
"epoch": 0.33,
"grad_norm": 1.81588876247406,
"learning_rate": 4.9980089370278275e-06,
"loss": 0.328,
"step": 1920
},
{
"epoch": 0.34,
"grad_norm": 1.8873347043991089,
"learning_rate": 4.997309684239618e-06,
"loss": 0.3189,
"step": 1952
},
{
"epoch": 0.35,
"grad_norm": 1.7700365781784058,
"learning_rate": 4.996505460043337e-06,
"loss": 0.303,
"step": 1984
},
{
"epoch": 0.35,
"grad_norm": 1.6417065858840942,
"learning_rate": 4.99559629824358e-06,
"loss": 0.3155,
"step": 2016
},
{
"epoch": 0.36,
"grad_norm": 1.689789891242981,
"learning_rate": 4.99458223705587e-06,
"loss": 0.3254,
"step": 2048
},
{
"epoch": 0.36,
"grad_norm": 2.12276291847229,
"learning_rate": 4.993463319105047e-06,
"loss": 0.2986,
"step": 2080
},
{
"epoch": 0.37,
"grad_norm": 1.7065095901489258,
"learning_rate": 4.992239591423483e-06,
"loss": 0.3197,
"step": 2112
},
{
"epoch": 0.37,
"grad_norm": 2.3180906772613525,
"learning_rate": 4.990911105449098e-06,
"loss": 0.3153,
"step": 2144
},
{
"epoch": 0.38,
"grad_norm": 1.8196240663528442,
"learning_rate": 4.9894779170232024e-06,
"loss": 0.313,
"step": 2176
},
{
"epoch": 0.38,
"grad_norm": 2.0028891563415527,
"learning_rate": 4.987940086388146e-06,
"loss": 0.31,
"step": 2208
},
{
"epoch": 0.39,
"grad_norm": 1.862817645072937,
"learning_rate": 4.986297678184791e-06,
"loss": 0.3035,
"step": 2240
},
{
"epoch": 0.4,
"grad_norm": 1.7188054323196411,
"learning_rate": 4.984550761449788e-06,
"loss": 0.3504,
"step": 2272
},
{
"epoch": 0.4,
"grad_norm": 1.6241610050201416,
"learning_rate": 4.982699409612683e-06,
"loss": 0.2926,
"step": 2304
},
{
"epoch": 0.41,
"grad_norm": 1.9147684574127197,
"learning_rate": 4.980743700492822e-06,
"loss": 0.3212,
"step": 2336
},
{
"epoch": 0.41,
"grad_norm": 1.5901225805282593,
"learning_rate": 4.978683716296084e-06,
"loss": 0.3057,
"step": 2368
},
{
"epoch": 0.42,
"grad_norm": 1.670644760131836,
"learning_rate": 4.976519543611427e-06,
"loss": 0.3038,
"step": 2400
},
{
"epoch": 0.42,
"grad_norm": 1.3500678539276123,
"learning_rate": 4.974251273407246e-06,
"loss": 0.3019,
"step": 2432
},
{
"epoch": 0.43,
"grad_norm": 1.6647801399230957,
"learning_rate": 4.971879001027552e-06,
"loss": 0.3017,
"step": 2464
},
{
"epoch": 0.43,
"grad_norm": 1.793252944946289,
"learning_rate": 4.9694028261879576e-06,
"loss": 0.3068,
"step": 2496
},
{
"epoch": 0.44,
"grad_norm": 1.9570722579956055,
"learning_rate": 4.966822852971493e-06,
"loss": 0.311,
"step": 2528
},
{
"epoch": 0.45,
"grad_norm": 1.787014365196228,
"learning_rate": 4.964139189824232e-06,
"loss": 0.2992,
"step": 2560
},
{
"epoch": 0.45,
"grad_norm": 1.68462073802948,
"learning_rate": 4.961351949550722e-06,
"loss": 0.3054,
"step": 2592
},
{
"epoch": 0.46,
"grad_norm": 1.7559980154037476,
"learning_rate": 4.958461249309258e-06,
"loss": 0.3119,
"step": 2624
},
{
"epoch": 0.46,
"grad_norm": 1.9941719770431519,
"learning_rate": 4.955467210606944e-06,
"loss": 0.3122,
"step": 2656
},
{
"epoch": 0.47,
"grad_norm": 1.6227741241455078,
"learning_rate": 4.9523699592945966e-06,
"loss": 0.3094,
"step": 2688
},
{
"epoch": 0.47,
"grad_norm": 1.8108834028244019,
"learning_rate": 4.9491696255614475e-06,
"loss": 0.3079,
"step": 2720
},
{
"epoch": 0.48,
"grad_norm": 1.871539831161499,
"learning_rate": 4.945866343929675e-06,
"loss": 0.3085,
"step": 2752
},
{
"epoch": 0.48,
"grad_norm": 1.8067561388015747,
"learning_rate": 4.94246025324875e-06,
"loss": 0.3138,
"step": 2784
},
{
"epoch": 0.49,
"grad_norm": 1.4682188034057617,
"learning_rate": 4.938951496689593e-06,
"loss": 0.3141,
"step": 2816
},
{
"epoch": 0.5,
"grad_norm": 1.8749171495437622,
"learning_rate": 4.935340221738568e-06,
"loss": 0.2903,
"step": 2848
},
{
"epoch": 0.5,
"grad_norm": 1.780875563621521,
"learning_rate": 4.931626580191271e-06,
"loss": 0.3086,
"step": 2880
},
{
"epoch": 0.51,
"grad_norm": 1.468619465827942,
"learning_rate": 4.927810728146158e-06,
"loss": 0.3016,
"step": 2912
},
{
"epoch": 0.51,
"grad_norm": 1.5029264688491821,
"learning_rate": 4.923892825997976e-06,
"loss": 0.2962,
"step": 2944
},
{
"epoch": 0.52,
"grad_norm": 1.9954643249511719,
"learning_rate": 4.919873038431031e-06,
"loss": 0.3135,
"step": 2976
},
{
"epoch": 0.52,
"grad_norm": 1.7070242166519165,
"learning_rate": 4.915751534412256e-06,
"loss": 0.3216,
"step": 3008
},
{
"epoch": 0.53,
"grad_norm": 1.9191008806228638,
"learning_rate": 4.911528487184115e-06,
"loss": 0.2814,
"step": 3040
},
{
"epoch": 0.53,
"grad_norm": 1.5027116537094116,
"learning_rate": 4.9072040742573154e-06,
"loss": 0.2917,
"step": 3072
},
{
"epoch": 0.54,
"grad_norm": 1.5248404741287231,
"learning_rate": 4.902778477403354e-06,
"loss": 0.3001,
"step": 3104
},
{
"epoch": 0.55,
"grad_norm": 1.996079921722412,
"learning_rate": 4.89825188264687e-06,
"loss": 0.2951,
"step": 3136
},
{
"epoch": 0.55,
"grad_norm": 1.7733569145202637,
"learning_rate": 4.893624480257826e-06,
"loss": 0.3031,
"step": 3168
},
{
"epoch": 0.56,
"grad_norm": 1.3993550539016724,
"learning_rate": 4.888896464743515e-06,
"loss": 0.2941,
"step": 3200
},
{
"epoch": 0.56,
"grad_norm": 1.37386953830719,
"learning_rate": 4.884068034840382e-06,
"loss": 0.2917,
"step": 3232
},
{
"epoch": 0.57,
"grad_norm": 1.7720749378204346,
"learning_rate": 4.879139393505669e-06,
"loss": 0.3038,
"step": 3264
},
{
"epoch": 0.57,
"grad_norm": 1.4496855735778809,
"learning_rate": 4.874110747908883e-06,
"loss": 0.3019,
"step": 3296
},
{
"epoch": 0.58,
"grad_norm": 1.70554518699646,
"learning_rate": 4.868982309423094e-06,
"loss": 0.2824,
"step": 3328
},
{
"epoch": 0.59,
"grad_norm": 1.6426483392715454,
"learning_rate": 4.863754293616043e-06,
"loss": 0.3064,
"step": 3360
},
{
"epoch": 0.59,
"grad_norm": 1.9177954196929932,
"learning_rate": 4.858426920241083e-06,
"loss": 0.3046,
"step": 3392
},
{
"epoch": 0.6,
"grad_norm": 1.8572829961776733,
"learning_rate": 4.853000413227946e-06,
"loss": 0.3032,
"step": 3424
},
{
"epoch": 0.6,
"grad_norm": 1.548661708831787,
"learning_rate": 4.8474750006733265e-06,
"loss": 0.2997,
"step": 3456
},
{
"epoch": 0.61,
"grad_norm": 1.4153339862823486,
"learning_rate": 4.841850914831291e-06,
"loss": 0.2939,
"step": 3488
},
{
"epoch": 0.61,
"grad_norm": 1.7126343250274658,
"learning_rate": 4.836128392103524e-06,
"loss": 0.3181,
"step": 3520
},
{
"epoch": 0.62,
"grad_norm": 1.8360246419906616,
"learning_rate": 4.830307673029383e-06,
"loss": 0.3026,
"step": 3552
},
{
"epoch": 0.62,
"grad_norm": 1.6076633930206299,
"learning_rate": 4.82438900227579e-06,
"loss": 0.3092,
"step": 3584
},
{
"epoch": 0.63,
"grad_norm": 1.890261173248291,
"learning_rate": 4.8183726286269515e-06,
"loss": 0.2965,
"step": 3616
},
{
"epoch": 0.64,
"grad_norm": 1.462218999862671,
"learning_rate": 4.812258804973895e-06,
"loss": 0.2851,
"step": 3648
},
{
"epoch": 0.64,
"grad_norm": 1.5263981819152832,
"learning_rate": 4.806047788303841e-06,
"loss": 0.3002,
"step": 3680
},
{
"epoch": 0.65,
"grad_norm": 1.9360984563827515,
"learning_rate": 4.799739839689404e-06,
"loss": 0.2918,
"step": 3712
},
{
"epoch": 0.65,
"grad_norm": 1.7371431589126587,
"learning_rate": 4.7933352242776136e-06,
"loss": 0.2962,
"step": 3744
},
{
"epoch": 0.66,
"grad_norm": 2.1363890171051025,
"learning_rate": 4.786834211278775e-06,
"loss": 0.2844,
"step": 3776
},
{
"epoch": 0.66,
"grad_norm": 1.2848248481750488,
"learning_rate": 4.780237073955147e-06,
"loss": 0.2927,
"step": 3808
},
{
"epoch": 0.67,
"grad_norm": 1.685262680053711,
"learning_rate": 4.773544089609462e-06,
"loss": 0.3018,
"step": 3840
},
{
"epoch": 0.67,
"grad_norm": 1.3517770767211914,
"learning_rate": 4.766755539573261e-06,
"loss": 0.2936,
"step": 3872
},
{
"epoch": 0.68,
"grad_norm": 1.723820447921753,
"learning_rate": 4.759871709195081e-06,
"loss": 0.2974,
"step": 3904
},
{
"epoch": 0.69,
"grad_norm": 1.8562276363372803,
"learning_rate": 4.752892887828448e-06,
"loss": 0.2967,
"step": 3936
},
{
"epoch": 0.69,
"grad_norm": 1.9959107637405396,
"learning_rate": 4.745819368819723e-06,
"loss": 0.2929,
"step": 3968
},
{
"epoch": 0.7,
"grad_norm": 2.034578323364258,
"learning_rate": 4.738651449495767e-06,
"loss": 0.2978,
"step": 4000
},
{
"epoch": 0.7,
"grad_norm": 1.2981053590774536,
"learning_rate": 4.731389431151445e-06,
"loss": 0.2951,
"step": 4032
},
{
"epoch": 0.71,
"grad_norm": 1.5861812829971313,
"learning_rate": 4.72403361903696e-06,
"loss": 0.2896,
"step": 4064
},
{
"epoch": 0.71,
"grad_norm": 1.2682240009307861,
"learning_rate": 4.716584322345028e-06,
"loss": 0.2973,
"step": 4096
},
{
"epoch": 0.72,
"grad_norm": 2.147078275680542,
"learning_rate": 4.70904185419787e-06,
"loss": 0.2944,
"step": 4128
},
{
"epoch": 0.72,
"grad_norm": 1.5924174785614014,
"learning_rate": 4.7014065316340606e-06,
"loss": 0.2859,
"step": 4160
},
{
"epoch": 0.73,
"grad_norm": 1.7698993682861328,
"learning_rate": 4.693678675595199e-06,
"loss": 0.2943,
"step": 4192
},
{
"epoch": 0.74,
"grad_norm": 1.599900245666504,
"learning_rate": 4.685858610912416e-06,
"loss": 0.3045,
"step": 4224
},
{
"epoch": 0.74,
"grad_norm": 1.6287124156951904,
"learning_rate": 4.677946666292722e-06,
"loss": 0.2884,
"step": 4256
},
{
"epoch": 0.75,
"grad_norm": 1.712890386581421,
"learning_rate": 4.66994317430519e-06,
"loss": 0.2946,
"step": 4288
},
{
"epoch": 0.75,
"grad_norm": 1.7418298721313477,
"learning_rate": 4.661848471366977e-06,
"loss": 0.2933,
"step": 4320
},
{
"epoch": 0.76,
"grad_norm": 1.7066587209701538,
"learning_rate": 4.653662897729183e-06,
"loss": 0.2934,
"step": 4352
},
{
"epoch": 0.76,
"grad_norm": 1.9766113758087158,
"learning_rate": 4.645386797462547e-06,
"loss": 0.3055,
"step": 4384
},
{
"epoch": 0.77,
"grad_norm": 1.7087821960449219,
"learning_rate": 4.637020518442986e-06,
"loss": 0.2857,
"step": 4416
},
{
"epoch": 0.77,
"grad_norm": 2.085517168045044,
"learning_rate": 4.628564412336975e-06,
"loss": 0.3509,
"step": 4448
},
{
"epoch": 0.78,
"grad_norm": 1.586540699005127,
"learning_rate": 4.620018834586759e-06,
"loss": 0.3071,
"step": 4480
},
{
"epoch": 0.79,
"grad_norm": 1.5229309797286987,
"learning_rate": 4.611384144395419e-06,
"loss": 0.2978,
"step": 4512
},
{
"epoch": 0.79,
"grad_norm": 1.5610827207565308,
"learning_rate": 4.602660704711768e-06,
"loss": 0.3,
"step": 4544
},
{
"epoch": 0.8,
"grad_norm": 1.3079458475112915,
"learning_rate": 4.593848882215098e-06,
"loss": 0.3023,
"step": 4576
},
{
"epoch": 0.8,
"grad_norm": 1.243058204650879,
"learning_rate": 4.584949047299766e-06,
"loss": 0.282,
"step": 4608
},
{
"epoch": 0.81,
"grad_norm": 1.5151655673980713,
"learning_rate": 4.5759615740596265e-06,
"loss": 0.2863,
"step": 4640
},
{
"epoch": 0.81,
"grad_norm": 1.6148176193237305,
"learning_rate": 4.5668868402723024e-06,
"loss": 0.29,
"step": 4672
},
{
"epoch": 0.82,
"grad_norm": 1.3599457740783691,
"learning_rate": 4.557725227383313e-06,
"loss": 0.292,
"step": 4704
},
{
"epoch": 0.82,
"grad_norm": 1.7854079008102417,
"learning_rate": 4.548477120490031e-06,
"loss": 0.3065,
"step": 4736
},
{
"epoch": 0.83,
"grad_norm": 1.7878711223602295,
"learning_rate": 4.539142908325506e-06,
"loss": 0.2986,
"step": 4768
},
{
"epoch": 0.84,
"grad_norm": 1.9896265268325806,
"learning_rate": 4.529722983242114e-06,
"loss": 0.3025,
"step": 4800
},
{
"epoch": 0.84,
"grad_norm": 1.5168676376342773,
"learning_rate": 4.5202177411950745e-06,
"loss": 0.2847,
"step": 4832
},
{
"epoch": 0.85,
"grad_norm": 1.6856423616409302,
"learning_rate": 4.5106275817258e-06,
"loss": 0.3038,
"step": 4864
},
{
"epoch": 0.85,
"grad_norm": 1.329127550125122,
"learning_rate": 4.5009529079451085e-06,
"loss": 0.2829,
"step": 4896
},
{
"epoch": 0.86,
"grad_norm": 1.9718722105026245,
"learning_rate": 4.4911941265162695e-06,
"loss": 0.2976,
"step": 4928
},
{
"epoch": 0.86,
"grad_norm": 1.4611300230026245,
"learning_rate": 4.481351647637921e-06,
"loss": 0.2829,
"step": 4960
},
{
"epoch": 0.87,
"grad_norm": 1.7366551160812378,
"learning_rate": 4.471425885026822e-06,
"loss": 0.2916,
"step": 4992
},
{
"epoch": 0.87,
"grad_norm": 1.381406307220459,
"learning_rate": 4.46141725590046e-06,
"loss": 0.2908,
"step": 5024
},
{
"epoch": 0.88,
"grad_norm": 1.4315983057022095,
"learning_rate": 4.451326180959521e-06,
"loss": 0.2705,
"step": 5056
},
{
"epoch": 0.89,
"grad_norm": 1.7121741771697998,
"learning_rate": 4.4411530843702e-06,
"loss": 0.2956,
"step": 5088
},
{
"epoch": 0.89,
"grad_norm": 1.845123529434204,
"learning_rate": 4.430898393746371e-06,
"loss": 0.2876,
"step": 5120
},
{
"epoch": 0.9,
"grad_norm": 1.3543351888656616,
"learning_rate": 4.420562540131618e-06,
"loss": 0.2962,
"step": 5152
},
{
"epoch": 0.9,
"grad_norm": 2.028374195098877,
"learning_rate": 4.410145957981112e-06,
"loss": 0.2937,
"step": 5184
},
{
"epoch": 0.91,
"grad_norm": 1.5300021171569824,
"learning_rate": 4.399649085143354e-06,
"loss": 0.2883,
"step": 5216
},
{
"epoch": 0.91,
"grad_norm": 1.8156017065048218,
"learning_rate": 4.3890723628417605e-06,
"loss": 0.2903,
"step": 5248
},
{
"epoch": 0.92,
"grad_norm": 1.8335083723068237,
"learning_rate": 4.378416235656133e-06,
"loss": 0.2964,
"step": 5280
},
{
"epoch": 0.92,
"grad_norm": 1.6856135129928589,
"learning_rate": 4.3676811515039554e-06,
"loss": 0.284,
"step": 5312
},
{
"epoch": 0.93,
"grad_norm": 1.8075724840164185,
"learning_rate": 4.356867561621575e-06,
"loss": 0.274,
"step": 5344
},
{
"epoch": 0.94,
"grad_norm": 2.1280436515808105,
"learning_rate": 4.345975920545232e-06,
"loss": 0.2781,
"step": 5376
},
{
"epoch": 0.94,
"grad_norm": 1.8782936334609985,
"learning_rate": 4.335006686091956e-06,
"loss": 0.2796,
"step": 5408
},
{
"epoch": 0.95,
"grad_norm": 1.802985429763794,
"learning_rate": 4.323960319340321e-06,
"loss": 0.2795,
"step": 5440
},
{
"epoch": 0.95,
"grad_norm": 1.6335434913635254,
"learning_rate": 4.312837284611062e-06,
"loss": 0.3002,
"step": 5472
},
{
"epoch": 0.96,
"grad_norm": 1.5709927082061768,
"learning_rate": 4.301638049447563e-06,
"loss": 0.2912,
"step": 5504
},
{
"epoch": 0.96,
"grad_norm": 1.890569806098938,
"learning_rate": 4.290363084596199e-06,
"loss": 0.2995,
"step": 5536
},
{
"epoch": 0.97,
"grad_norm": 1.405389666557312,
"learning_rate": 4.279012863986554e-06,
"loss": 0.2818,
"step": 5568
},
{
"epoch": 0.98,
"grad_norm": 1.4392948150634766,
"learning_rate": 4.267587864711496e-06,
"loss": 0.2944,
"step": 5600
},
{
"epoch": 0.98,
"grad_norm": 1.3795379400253296,
"learning_rate": 4.256088567007123e-06,
"loss": 0.2754,
"step": 5632
},
{
"epoch": 0.99,
"grad_norm": 1.4689635038375854,
"learning_rate": 4.244515454232579e-06,
"loss": 0.2935,
"step": 5664
},
{
"epoch": 0.99,
"grad_norm": 1.6686134338378906,
"learning_rate": 4.232869012849739e-06,
"loss": 0.2945,
"step": 5696
},
{
"epoch": 1.0,
"grad_norm": 1.3574343919754028,
"learning_rate": 4.22114973240275e-06,
"loss": 0.2899,
"step": 5728
},
{
"epoch": 1.0,
"grad_norm": 1.4560518264770508,
"learning_rate": 4.20935810549747e-06,
"loss": 0.2737,
"step": 5760
},
{
"epoch": 1.01,
"grad_norm": 1.5906906127929688,
"learning_rate": 4.1974946277807485e-06,
"loss": 0.2523,
"step": 5792
},
{
"epoch": 1.01,
"grad_norm": 2.1945157051086426,
"learning_rate": 4.185559797919597e-06,
"loss": 0.2449,
"step": 5824
},
{
"epoch": 1.02,
"grad_norm": 2.203098773956299,
"learning_rate": 4.173554117580231e-06,
"loss": 0.2434,
"step": 5856
},
{
"epoch": 1.03,
"grad_norm": 2.0016815662384033,
"learning_rate": 4.16147809140698e-06,
"loss": 0.2408,
"step": 5888
},
{
"epoch": 1.03,
"grad_norm": 1.8361921310424805,
"learning_rate": 4.149332227001075e-06,
"loss": 0.2466,
"step": 5920
},
{
"epoch": 1.04,
"grad_norm": 1.681320071220398,
"learning_rate": 4.137117034899314e-06,
"loss": 0.2437,
"step": 5952
},
{
"epoch": 1.04,
"grad_norm": 2.212421178817749,
"learning_rate": 4.124833028552601e-06,
"loss": 0.2488,
"step": 5984
},
{
"epoch": 1.05,
"grad_norm": 3.8892650604248047,
"learning_rate": 4.112480724304362e-06,
"loss": 0.2382,
"step": 6016
},
{
"epoch": 1.05,
"grad_norm": 1.904011607170105,
"learning_rate": 4.100060641368848e-06,
"loss": 0.2461,
"step": 6048
},
{
"epoch": 1.06,
"grad_norm": 1.6115171909332275,
"learning_rate": 4.087573301809301e-06,
"loss": 0.2475,
"step": 6080
},
{
"epoch": 1.06,
"grad_norm": 1.625502109527588,
"learning_rate": 4.075019230516016e-06,
"loss": 0.2535,
"step": 6112
},
{
"epoch": 1.07,
"grad_norm": 1.7666929960250854,
"learning_rate": 4.062398955184277e-06,
"loss": 0.2471,
"step": 6144
},
{
"epoch": 1.08,
"grad_norm": 1.4440840482711792,
"learning_rate": 4.049713006292174e-06,
"loss": 0.2346,
"step": 6176
},
{
"epoch": 1.08,
"grad_norm": 1.6042975187301636,
"learning_rate": 4.036961917078305e-06,
"loss": 0.2452,
"step": 6208
},
{
"epoch": 1.09,
"grad_norm": 1.5292000770568848,
"learning_rate": 4.024146223519365e-06,
"loss": 0.2303,
"step": 6240
},
{
"epoch": 1.09,
"grad_norm": 1.6863032579421997,
"learning_rate": 4.011266464307615e-06,
"loss": 0.2454,
"step": 6272
},
{
"epoch": 1.1,
"grad_norm": 1.8112516403198242,
"learning_rate": 3.998323180828236e-06,
"loss": 0.2335,
"step": 6304
},
{
"epoch": 1.1,
"grad_norm": 1.5118499994277954,
"learning_rate": 3.985316917136579e-06,
"loss": 0.2534,
"step": 6336
},
{
"epoch": 1.11,
"grad_norm": 1.620071291923523,
"learning_rate": 3.972248219935289e-06,
"loss": 0.2311,
"step": 6368
},
{
"epoch": 1.11,
"grad_norm": 1.4030137062072754,
"learning_rate": 3.959117638551331e-06,
"loss": 0.2411,
"step": 6400
},
{
"epoch": 1.12,
"grad_norm": 1.6774815320968628,
"learning_rate": 3.945925724912896e-06,
"loss": 0.2546,
"step": 6432
},
{
"epoch": 1.13,
"grad_norm": 2.215942144393921,
"learning_rate": 3.932673033526203e-06,
"loss": 0.2453,
"step": 6464
},
{
"epoch": 1.13,
"grad_norm": 1.5397557020187378,
"learning_rate": 3.919360121452188e-06,
"loss": 0.2455,
"step": 6496
},
{
"epoch": 1.14,
"grad_norm": 1.6343345642089844,
"learning_rate": 3.905987548283097e-06,
"loss": 0.2345,
"step": 6528
},
{
"epoch": 1.14,
"grad_norm": 1.7040832042694092,
"learning_rate": 3.892555876118951e-06,
"loss": 0.2456,
"step": 6560
},
{
"epoch": 1.15,
"grad_norm": 1.6422055959701538,
"learning_rate": 3.879065669543931e-06,
"loss": 0.2464,
"step": 6592
},
{
"epoch": 1.15,
"grad_norm": 1.47724187374115,
"learning_rate": 3.865517495602642e-06,
"loss": 0.2379,
"step": 6624
},
{
"epoch": 1.16,
"grad_norm": 1.5706995725631714,
"learning_rate": 3.851911923776274e-06,
"loss": 0.2542,
"step": 6656
},
{
"epoch": 1.16,
"grad_norm": 1.3481061458587646,
"learning_rate": 3.83824952595867e-06,
"loss": 0.2358,
"step": 6688
},
{
"epoch": 1.17,
"grad_norm": 1.7855818271636963,
"learning_rate": 3.824530876432287e-06,
"loss": 0.2506,
"step": 6720
},
{
"epoch": 1.18,
"grad_norm": 1.661627173423767,
"learning_rate": 3.81075655184405e-06,
"loss": 0.2477,
"step": 6752
},
{
"epoch": 1.18,
"grad_norm": 1.3250243663787842,
"learning_rate": 3.796927131181124e-06,
"loss": 0.236,
"step": 6784
},
{
"epoch": 1.19,
"grad_norm": 1.5946906805038452,
"learning_rate": 3.7830431957465673e-06,
"loss": 0.2405,
"step": 6816
},
{
"epoch": 1.19,
"grad_norm": 1.4719637632369995,
"learning_rate": 3.7691053291349012e-06,
"loss": 0.2444,
"step": 6848
},
{
"epoch": 1.2,
"grad_norm": 2.088393211364746,
"learning_rate": 3.755114117207582e-06,
"loss": 0.238,
"step": 6880
},
{
"epoch": 1.2,
"grad_norm": 1.853722333908081,
"learning_rate": 3.7410701480683693e-06,
"loss": 0.2496,
"step": 6912
},
{
"epoch": 1.21,
"grad_norm": 1.338397741317749,
"learning_rate": 3.726974012038609e-06,
"loss": 0.2482,
"step": 6944
},
{
"epoch": 1.21,
"grad_norm": 1.6698582172393799,
"learning_rate": 3.7128263016324205e-06,
"loss": 0.2501,
"step": 6976
},
{
"epoch": 1.22,
"grad_norm": 2.1817092895507812,
"learning_rate": 3.698627611531791e-06,
"loss": 0.2382,
"step": 7008
},
{
"epoch": 1.23,
"grad_norm": 1.6343122720718384,
"learning_rate": 3.684378538561575e-06,
"loss": 0.2447,
"step": 7040
},
{
"epoch": 1.23,
"grad_norm": 1.335663080215454,
"learning_rate": 3.6700796816644115e-06,
"loss": 0.245,
"step": 7072
},
{
"epoch": 1.24,
"grad_norm": 1.7304885387420654,
"learning_rate": 3.655731641875549e-06,
"loss": 0.2541,
"step": 7104
},
{
"epoch": 1.24,
"grad_norm": 1.6443067789077759,
"learning_rate": 3.641335022297576e-06,
"loss": 0.249,
"step": 7136
},
{
"epoch": 1.25,
"grad_norm": 1.5171420574188232,
"learning_rate": 3.626890428075077e-06,
"loss": 0.248,
"step": 7168
},
{
"epoch": 1.25,
"grad_norm": 1.6931904554367065,
"learning_rate": 3.6123984663691925e-06,
"loss": 0.2404,
"step": 7200
},
{
"epoch": 1.26,
"grad_norm": 1.7795478105545044,
"learning_rate": 3.5978597463320964e-06,
"loss": 0.2462,
"step": 7232
},
{
"epoch": 1.26,
"grad_norm": 1.7948668003082275,
"learning_rate": 3.5832748790813945e-06,
"loss": 0.2507,
"step": 7264
},
{
"epoch": 1.27,
"grad_norm": 1.6601665019989014,
"learning_rate": 3.5686444776744346e-06,
"loss": 0.2463,
"step": 7296
},
{
"epoch": 1.28,
"grad_norm": 1.675183892250061,
"learning_rate": 3.5539691570825374e-06,
"loss": 0.2514,
"step": 7328
},
{
"epoch": 1.28,
"grad_norm": 1.5422120094299316,
"learning_rate": 3.5392495341651497e-06,
"loss": 0.247,
"step": 7360
},
{
"epoch": 1.29,
"grad_norm": 1.522061824798584,
"learning_rate": 3.5244862276439102e-06,
"loss": 0.2455,
"step": 7392
},
{
"epoch": 1.29,
"grad_norm": 1.5597782135009766,
"learning_rate": 3.5096798580766476e-06,
"loss": 0.2514,
"step": 7424
},
{
"epoch": 1.3,
"grad_norm": 2.3420979976654053,
"learning_rate": 3.494831047831293e-06,
"loss": 0.2486,
"step": 7456
},
{
"epoch": 1.3,
"grad_norm": 1.397116780281067,
"learning_rate": 3.479940421059721e-06,
"loss": 0.2388,
"step": 7488
},
{
"epoch": 1.31,
"grad_norm": 1.9889482259750366,
"learning_rate": 3.4650086036715123e-06,
"loss": 0.2521,
"step": 7520
},
{
"epoch": 1.31,
"grad_norm": 1.3389945030212402,
"learning_rate": 3.450036223307647e-06,
"loss": 0.254,
"step": 7552
},
{
"epoch": 1.32,
"grad_norm": 1.550194263458252,
"learning_rate": 3.435023909314119e-06,
"loss": 0.2411,
"step": 7584
},
{
"epoch": 1.33,
"grad_norm": 1.39427649974823,
"learning_rate": 3.4199722927154876e-06,
"loss": 0.2421,
"step": 7616
},
{
"epoch": 1.33,
"grad_norm": 1.647886037826538,
"learning_rate": 3.4048820061883475e-06,
"loss": 0.2546,
"step": 7648
},
{
"epoch": 1.34,
"grad_norm": 1.4851469993591309,
"learning_rate": 3.3897536840347384e-06,
"loss": 0.2543,
"step": 7680
},
{
"epoch": 1.34,
"grad_norm": 1.5985283851623535,
"learning_rate": 3.3745879621554793e-06,
"loss": 0.2505,
"step": 7712
},
{
"epoch": 1.35,
"grad_norm": 1.467861294746399,
"learning_rate": 3.3593854780234446e-06,
"loss": 0.2408,
"step": 7744
},
{
"epoch": 1.35,
"grad_norm": 1.5086297988891602,
"learning_rate": 3.3441468706567655e-06,
"loss": 0.2492,
"step": 7776
},
{
"epoch": 1.36,
"grad_norm": 1.6549632549285889,
"learning_rate": 3.328872780591968e-06,
"loss": 0.2431,
"step": 7808
},
{
"epoch": 1.37,
"grad_norm": 1.413674235343933,
"learning_rate": 3.313563849857052e-06,
"loss": 0.2539,
"step": 7840
},
{
"epoch": 1.37,
"grad_norm": 1.547775387763977,
"learning_rate": 3.298220721944504e-06,
"loss": 0.2376,
"step": 7872
},
{
"epoch": 1.38,
"grad_norm": 1.834301471710205,
"learning_rate": 3.282844041784245e-06,
"loss": 0.2465,
"step": 7904
},
{
"epoch": 1.38,
"grad_norm": 1.5492215156555176,
"learning_rate": 3.2674344557165268e-06,
"loss": 0.2462,
"step": 7936
},
{
"epoch": 1.39,
"grad_norm": 1.691975474357605,
"learning_rate": 3.2519926114647597e-06,
"loss": 0.2423,
"step": 7968
},
{
"epoch": 1.39,
"grad_norm": 1.4883307218551636,
"learning_rate": 3.2365191581082894e-06,
"loss": 0.2431,
"step": 8000
},
{
"epoch": 1.4,
"grad_norm": 1.9474862813949585,
"learning_rate": 3.221014746055112e-06,
"loss": 0.2409,
"step": 8032
},
{
"epoch": 1.4,
"grad_norm": 1.7408241033554077,
"learning_rate": 3.205480027014535e-06,
"loss": 0.2529,
"step": 8064
},
{
"epoch": 1.41,
"grad_norm": 1.9496859312057495,
"learning_rate": 3.1899156539697817e-06,
"loss": 0.244,
"step": 8096
},
{
"epoch": 1.42,
"grad_norm": 1.5115084648132324,
"learning_rate": 3.174322281150549e-06,
"loss": 0.2355,
"step": 8128
},
{
"epoch": 1.42,
"grad_norm": 1.5742942094802856,
"learning_rate": 3.1587005640055035e-06,
"loss": 0.237,
"step": 8160
},
{
"epoch": 1.43,
"grad_norm": 1.6793855428695679,
"learning_rate": 3.14305115917473e-06,
"loss": 0.2357,
"step": 8192
},
{
"epoch": 1.43,
"grad_norm": 1.359031081199646,
"learning_rate": 3.1273747244621333e-06,
"loss": 0.2507,
"step": 8224
},
{
"epoch": 1.44,
"grad_norm": 1.3813912868499756,
"learning_rate": 3.1116719188077867e-06,
"loss": 0.2455,
"step": 8256
},
{
"epoch": 1.44,
"grad_norm": 1.2507054805755615,
"learning_rate": 3.0959434022602326e-06,
"loss": 0.2376,
"step": 8288
},
{
"epoch": 1.45,
"grad_norm": 1.6926429271697998,
"learning_rate": 3.080189835948742e-06,
"loss": 0.243,
"step": 8320
},
{
"epoch": 1.45,
"grad_norm": 1.7547937631607056,
"learning_rate": 3.0644118820555217e-06,
"loss": 0.2418,
"step": 8352
},
{
"epoch": 1.46,
"grad_norm": 1.5827124118804932,
"learning_rate": 3.048610203787881e-06,
"loss": 0.2422,
"step": 8384
},
{
"epoch": 1.47,
"grad_norm": 1.5525377988815308,
"learning_rate": 3.0327854653503554e-06,
"loss": 0.2382,
"step": 8416
},
{
"epoch": 1.47,
"grad_norm": 1.502051830291748,
"learning_rate": 3.0169383319167866e-06,
"loss": 0.2343,
"step": 8448
},
{
"epoch": 1.48,
"grad_norm": 1.5819698572158813,
"learning_rate": 3.001069469602361e-06,
"loss": 0.2307,
"step": 8480
},
{
"epoch": 1.48,
"grad_norm": 1.678463339805603,
"learning_rate": 2.9851795454356164e-06,
"loss": 0.2395,
"step": 8512
},
{
"epoch": 1.49,
"grad_norm": 1.5477441549301147,
"learning_rate": 2.969269227330397e-06,
"loss": 0.2458,
"step": 8544
},
{
"epoch": 1.49,
"grad_norm": 1.6442259550094604,
"learning_rate": 2.953339184057783e-06,
"loss": 0.2454,
"step": 8576
},
{
"epoch": 1.5,
"grad_norm": 1.751511573791504,
"learning_rate": 2.9373900852179784e-06,
"loss": 0.2286,
"step": 8608
},
{
"epoch": 1.5,
"grad_norm": 1.5601673126220703,
"learning_rate": 2.9214226012121638e-06,
"loss": 0.2389,
"step": 8640
},
{
"epoch": 1.51,
"grad_norm": 1.3844302892684937,
"learning_rate": 2.905437403214319e-06,
"loss": 0.2454,
"step": 8672
},
{
"epoch": 1.52,
"grad_norm": 1.2684638500213623,
"learning_rate": 2.88943516314301e-06,
"loss": 0.2376,
"step": 8704
},
{
"epoch": 1.52,
"grad_norm": 2.507758617401123,
"learning_rate": 2.873416553633147e-06,
"loss": 0.2269,
"step": 8736
},
{
"epoch": 1.53,
"grad_norm": 1.6755341291427612,
"learning_rate": 2.857382248007708e-06,
"loss": 0.2236,
"step": 8768
},
{
"epoch": 1.53,
"grad_norm": 1.9231692552566528,
"learning_rate": 2.8413329202494396e-06,
"loss": 0.2541,
"step": 8800
},
{
"epoch": 1.54,
"grad_norm": 1.528493046760559,
"learning_rate": 2.825269244972525e-06,
"loss": 0.2271,
"step": 8832
},
{
"epoch": 1.54,
"grad_norm": 1.3824098110198975,
"learning_rate": 2.8091918973942276e-06,
"loss": 0.2451,
"step": 8864
},
{
"epoch": 1.55,
"grad_norm": 1.4477118253707886,
"learning_rate": 2.7931015533065116e-06,
"loss": 0.2382,
"step": 8896
},
{
"epoch": 1.55,
"grad_norm": 1.9449779987335205,
"learning_rate": 2.776998889047631e-06,
"loss": 0.2543,
"step": 8928
},
{
"epoch": 1.56,
"grad_norm": 1.2107715606689453,
"learning_rate": 2.760884581473706e-06,
"loss": 0.2388,
"step": 8960
},
{
"epoch": 1.57,
"grad_norm": 1.4030863046646118,
"learning_rate": 2.744759307930268e-06,
"loss": 0.24,
"step": 8992
},
{
"epoch": 1.57,
"grad_norm": 1.8753631114959717,
"learning_rate": 2.7286237462237907e-06,
"loss": 0.2401,
"step": 9024
},
{
"epoch": 1.58,
"grad_norm": 1.5679469108581543,
"learning_rate": 2.7124785745931974e-06,
"loss": 0.2519,
"step": 9056
},
{
"epoch": 1.58,
"grad_norm": 1.4486414194107056,
"learning_rate": 2.696324471681353e-06,
"loss": 0.2457,
"step": 9088
},
{
"epoch": 1.59,
"grad_norm": 1.8142122030258179,
"learning_rate": 2.6801621165065384e-06,
"loss": 0.2385,
"step": 9120
},
{
"epoch": 1.59,
"grad_norm": 1.4930641651153564,
"learning_rate": 2.6639921884339094e-06,
"loss": 0.2411,
"step": 9152
},
{
"epoch": 1.6,
"grad_norm": 1.5598373413085938,
"learning_rate": 2.647815367146937e-06,
"loss": 0.2265,
"step": 9184
},
{
"epoch": 1.6,
"grad_norm": 1.7909144163131714,
"learning_rate": 2.631632332618844e-06,
"loss": 0.2502,
"step": 9216
},
{
"epoch": 1.61,
"grad_norm": 1.6813157796859741,
"learning_rate": 2.6154437650840153e-06,
"loss": 0.2368,
"step": 9248
},
{
"epoch": 1.62,
"grad_norm": 1.4486907720565796,
"learning_rate": 2.599250345009411e-06,
"loss": 0.2423,
"step": 9280
},
{
"epoch": 1.62,
"grad_norm": 1.7712582349777222,
"learning_rate": 2.583052753065962e-06,
"loss": 0.2312,
"step": 9312
},
{
"epoch": 1.63,
"grad_norm": 1.3970569372177124,
"learning_rate": 2.5668516700999585e-06,
"loss": 0.2415,
"step": 9344
},
{
"epoch": 1.63,
"grad_norm": 1.3571640253067017,
"learning_rate": 2.5506477771044313e-06,
"loss": 0.2254,
"step": 9376
},
{
"epoch": 1.64,
"grad_norm": 1.6543464660644531,
"learning_rate": 2.5344417551905276e-06,
"loss": 0.2358,
"step": 9408
},
{
"epoch": 1.64,
"grad_norm": 2.0646538734436035,
"learning_rate": 2.518234285558882e-06,
"loss": 0.2433,
"step": 9440
},
{
"epoch": 1.65,
"grad_norm": 1.5231086015701294,
"learning_rate": 2.50202604947098e-06,
"loss": 0.2289,
"step": 9472
},
{
"epoch": 1.65,
"grad_norm": 1.7834982872009277,
"learning_rate": 2.485817728220526e-06,
"loss": 0.2366,
"step": 9504
},
{
"epoch": 1.66,
"grad_norm": 1.4642645120620728,
"learning_rate": 2.469610003104804e-06,
"loss": 0.2354,
"step": 9536
},
{
"epoch": 1.67,
"grad_norm": 1.4511359930038452,
"learning_rate": 2.453403555396038e-06,
"loss": 0.2394,
"step": 9568
},
{
"epoch": 1.67,
"grad_norm": 1.9833004474639893,
"learning_rate": 2.4371990663127613e-06,
"loss": 0.2453,
"step": 9600
},
{
"epoch": 1.68,
"grad_norm": 2.215998888015747,
"learning_rate": 2.420997216991178e-06,
"loss": 0.2345,
"step": 9632
},
{
"epoch": 1.68,
"grad_norm": 1.5565507411956787,
"learning_rate": 2.404798688456529e-06,
"loss": 0.2465,
"step": 9664
},
{
"epoch": 1.69,
"grad_norm": 1.583949327468872,
"learning_rate": 2.3886041615944753e-06,
"loss": 0.2442,
"step": 9696
},
{
"epoch": 1.69,
"grad_norm": 1.6056400537490845,
"learning_rate": 2.3724143171224684e-06,
"loss": 0.2346,
"step": 9728
},
{
"epoch": 1.7,
"grad_norm": 1.5098297595977783,
"learning_rate": 2.3562298355611444e-06,
"loss": 0.2444,
"step": 9760
},
{
"epoch": 1.71,
"grad_norm": 1.4785221815109253,
"learning_rate": 2.3400513972057117e-06,
"loss": 0.245,
"step": 9792
},
{
"epoch": 1.71,
"grad_norm": 1.4235949516296387,
"learning_rate": 2.323879682097365e-06,
"loss": 0.2386,
"step": 9824
},
{
"epoch": 1.72,
"grad_norm": 1.4386334419250488,
"learning_rate": 2.3077153699946912e-06,
"loss": 0.2303,
"step": 9856
},
{
"epoch": 1.72,
"grad_norm": 1.6986160278320312,
"learning_rate": 2.291559140345102e-06,
"loss": 0.2329,
"step": 9888
},
{
"epoch": 1.73,
"grad_norm": 1.5963070392608643,
"learning_rate": 2.2754116722562756e-06,
"loss": 0.2352,
"step": 9920
},
{
"epoch": 1.73,
"grad_norm": 1.5542904138565063,
"learning_rate": 2.2592736444676035e-06,
"loss": 0.2378,
"step": 9952
},
{
"epoch": 1.74,
"grad_norm": 2.3971989154815674,
"learning_rate": 2.243145735321669e-06,
"loss": 0.2347,
"step": 9984
},
{
"epoch": 1.74,
"grad_norm": 1.5635007619857788,
"learning_rate": 2.2270286227357306e-06,
"loss": 0.2406,
"step": 10016
},
{
"epoch": 1.75,
"grad_norm": 1.545464038848877,
"learning_rate": 2.210922984173223e-06,
"loss": 0.2329,
"step": 10048
},
{
"epoch": 1.76,
"grad_norm": 1.3175172805786133,
"learning_rate": 2.19482949661529e-06,
"loss": 0.2448,
"step": 10080
},
{
"epoch": 1.76,
"grad_norm": 1.4598995447158813,
"learning_rate": 2.1787488365323163e-06,
"loss": 0.2294,
"step": 10112
},
{
"epoch": 1.77,
"grad_norm": 1.50557279586792,
"learning_rate": 2.1626816798555035e-06,
"loss": 0.2427,
"step": 10144
},
{
"epoch": 1.77,
"grad_norm": 1.596571683883667,
"learning_rate": 2.14662870194845e-06,
"loss": 0.2187,
"step": 10176
},
{
"epoch": 1.78,
"grad_norm": 1.5053282976150513,
"learning_rate": 2.1305905775787713e-06,
"loss": 0.2394,
"step": 10208
},
{
"epoch": 1.78,
"grad_norm": 1.6713193655014038,
"learning_rate": 2.1145679808897297e-06,
"loss": 0.2333,
"step": 10240
},
{
"epoch": 1.79,
"grad_norm": 1.394679069519043,
"learning_rate": 2.098561585371898e-06,
"loss": 0.236,
"step": 10272
},
{
"epoch": 1.79,
"grad_norm": 1.6781407594680786,
"learning_rate": 2.082572063834857e-06,
"loss": 0.2461,
"step": 10304
},
{
"epoch": 1.8,
"grad_norm": 1.6299934387207031,
"learning_rate": 2.066600088378906e-06,
"loss": 0.2464,
"step": 10336
},
{
"epoch": 1.81,
"grad_norm": 1.309910774230957,
"learning_rate": 2.0506463303668182e-06,
"loss": 0.2395,
"step": 10368
},
{
"epoch": 1.81,
"grad_norm": 1.5353403091430664,
"learning_rate": 2.0347114603956184e-06,
"loss": 0.2396,
"step": 10400
},
{
"epoch": 1.82,
"grad_norm": 1.5149140357971191,
"learning_rate": 2.018796148268393e-06,
"loss": 0.2267,
"step": 10432
},
{
"epoch": 1.82,
"grad_norm": 1.8665707111358643,
"learning_rate": 2.002901062966141e-06,
"loss": 0.2343,
"step": 10464
},
{
"epoch": 1.83,
"grad_norm": 1.7906211614608765,
"learning_rate": 1.9870268726196493e-06,
"loss": 0.2332,
"step": 10496
},
{
"epoch": 1.83,
"grad_norm": 1.6433167457580566,
"learning_rate": 1.971174244481411e-06,
"loss": 0.2387,
"step": 10528
},
{
"epoch": 1.84,
"grad_norm": 1.7116189002990723,
"learning_rate": 1.9553438448975766e-06,
"loss": 0.244,
"step": 10560
},
{
"epoch": 1.84,
"grad_norm": 1.4329649209976196,
"learning_rate": 1.9395363392799486e-06,
"loss": 0.2301,
"step": 10592
},
{
"epoch": 1.85,
"grad_norm": 1.4449670314788818,
"learning_rate": 1.9237523920780077e-06,
"loss": 0.2258,
"step": 10624
},
{
"epoch": 1.86,
"grad_norm": 1.8179880380630493,
"learning_rate": 1.9079926667509833e-06,
"loss": 0.2353,
"step": 10656
},
{
"epoch": 1.86,
"grad_norm": 1.6381878852844238,
"learning_rate": 1.892257825739971e-06,
"loss": 0.2263,
"step": 10688
},
{
"epoch": 1.87,
"grad_norm": 1.6362228393554688,
"learning_rate": 1.8765485304400804e-06,
"loss": 0.2312,
"step": 10720
},
{
"epoch": 1.87,
"grad_norm": 1.6400152444839478,
"learning_rate": 1.8608654411726407e-06,
"loss": 0.2313,
"step": 10752
},
{
"epoch": 1.88,
"grad_norm": 1.1943421363830566,
"learning_rate": 1.8452092171574418e-06,
"loss": 0.2239,
"step": 10784
},
{
"epoch": 1.88,
"grad_norm": 1.8176000118255615,
"learning_rate": 1.8295805164850217e-06,
"loss": 0.231,
"step": 10816
},
{
"epoch": 1.89,
"grad_norm": 1.5386502742767334,
"learning_rate": 1.8139799960890132e-06,
"loss": 0.233,
"step": 10848
},
{
"epoch": 1.89,
"grad_norm": 1.666212558746338,
"learning_rate": 1.79840831171852e-06,
"loss": 0.2387,
"step": 10880
},
{
"epoch": 1.9,
"grad_norm": 2.0908565521240234,
"learning_rate": 1.7828661179105618e-06,
"loss": 0.2301,
"step": 10912
},
{
"epoch": 1.91,
"grad_norm": 1.824118733406067,
"learning_rate": 1.767354067962555e-06,
"loss": 0.2211,
"step": 10944
},
{
"epoch": 1.91,
"grad_norm": 1.6803866624832153,
"learning_rate": 1.7518728139048585e-06,
"loss": 0.2283,
"step": 10976
},
{
"epoch": 1.92,
"grad_norm": 1.5006029605865479,
"learning_rate": 1.7364230064733606e-06,
"loss": 0.2376,
"step": 11008
},
{
"epoch": 1.92,
"grad_norm": 1.6305315494537354,
"learning_rate": 1.7210052950821276e-06,
"loss": 0.2274,
"step": 11040
},
{
"epoch": 1.93,
"grad_norm": 1.4924837350845337,
"learning_rate": 1.7056203277961112e-06,
"loss": 0.2301,
"step": 11072
},
{
"epoch": 1.93,
"grad_norm": 1.7117606401443481,
"learning_rate": 1.6902687513039002e-06,
"loss": 0.2391,
"step": 11104
},
{
"epoch": 1.94,
"grad_norm": 1.9313896894454956,
"learning_rate": 1.6749512108905424e-06,
"loss": 0.2268,
"step": 11136
},
{
"epoch": 1.94,
"grad_norm": 1.4171528816223145,
"learning_rate": 1.6596683504104228e-06,
"loss": 0.2327,
"step": 11168
},
{
"epoch": 1.95,
"grad_norm": 1.576985478401184,
"learning_rate": 1.6444208122601933e-06,
"loss": 0.2352,
"step": 11200
},
{
"epoch": 1.96,
"grad_norm": 1.453007698059082,
"learning_rate": 1.6292092373517765e-06,
"loss": 0.2315,
"step": 11232
},
{
"epoch": 1.96,
"grad_norm": 1.5981701612472534,
"learning_rate": 1.6140342650854218e-06,
"loss": 0.2304,
"step": 11264
},
{
"epoch": 1.97,
"grad_norm": 2.0827910900115967,
"learning_rate": 1.5988965333228323e-06,
"loss": 0.2338,
"step": 11296
},
{
"epoch": 1.97,
"grad_norm": 1.5926817655563354,
"learning_rate": 1.5837966783603494e-06,
"loss": 0.2498,
"step": 11328
},
{
"epoch": 1.98,
"grad_norm": 1.82351815700531,
"learning_rate": 1.5687353349022114e-06,
"loss": 0.2379,
"step": 11360
},
{
"epoch": 1.98,
"grad_norm": 1.5781277418136597,
"learning_rate": 1.5537131360338697e-06,
"loss": 0.2421,
"step": 11392
},
{
"epoch": 1.99,
"grad_norm": 1.4788808822631836,
"learning_rate": 1.5387307131953793e-06,
"loss": 0.2371,
"step": 11424
},
{
"epoch": 1.99,
"grad_norm": 1.8411000967025757,
"learning_rate": 1.5237886961548615e-06,
"loss": 0.2371,
"step": 11456
},
{
"epoch": 2.0,
"grad_norm": 1.7383184432983398,
"learning_rate": 1.508887712982024e-06,
"loss": 0.2307,
"step": 11488
},
{
"epoch": 2.01,
"grad_norm": 1.6967484951019287,
"learning_rate": 1.4940283900217667e-06,
"loss": 0.1677,
"step": 11520
},
{
"epoch": 2.01,
"grad_norm": 1.5547412633895874,
"learning_rate": 1.4792113518678553e-06,
"loss": 0.1562,
"step": 11552
},
{
"epoch": 2.02,
"grad_norm": 1.484421730041504,
"learning_rate": 1.46443722133666e-06,
"loss": 0.1644,
"step": 11584
},
{
"epoch": 2.02,
"grad_norm": 1.7713335752487183,
"learning_rate": 1.4497066194409849e-06,
"loss": 0.1703,
"step": 11616
},
{
"epoch": 2.03,
"grad_norm": 1.6081045866012573,
"learning_rate": 1.435020165363956e-06,
"loss": 0.1621,
"step": 11648
},
{
"epoch": 2.03,
"grad_norm": 1.6781724691390991,
"learning_rate": 1.4203784764330033e-06,
"loss": 0.1706,
"step": 11680
},
{
"epoch": 2.04,
"grad_norm": 1.5613493919372559,
"learning_rate": 1.4057821680939049e-06,
"loss": 0.1594,
"step": 11712
},
{
"epoch": 2.04,
"grad_norm": 1.6171530485153198,
"learning_rate": 1.3912318538849207e-06,
"loss": 0.1598,
"step": 11744
},
{
"epoch": 2.05,
"grad_norm": 1.4825495481491089,
"learning_rate": 1.3767281454110037e-06,
"loss": 0.1738,
"step": 11776
},
{
"epoch": 2.06,
"grad_norm": 1.7642580270767212,
"learning_rate": 1.3622716523180898e-06,
"loss": 0.1563,
"step": 11808
},
{
"epoch": 2.06,
"grad_norm": 1.3485716581344604,
"learning_rate": 1.347862982267475e-06,
"loss": 0.1571,
"step": 11840
},
{
"epoch": 2.07,
"grad_norm": 1.6963993310928345,
"learning_rate": 1.3335027409102663e-06,
"loss": 0.1567,
"step": 11872
},
{
"epoch": 2.07,
"grad_norm": 1.5955970287322998,
"learning_rate": 1.3191915318619357e-06,
"loss": 0.1675,
"step": 11904
},
{
"epoch": 2.08,
"grad_norm": 2.1548969745635986,
"learning_rate": 1.304929956676938e-06,
"loss": 0.1666,
"step": 11936
},
{
"epoch": 2.08,
"grad_norm": 1.442049503326416,
"learning_rate": 1.2907186148234246e-06,
"loss": 0.1505,
"step": 11968
},
{
"epoch": 2.09,
"grad_norm": 1.654685139656067,
"learning_rate": 1.276558103658057e-06,
"loss": 0.1642,
"step": 12000
},
{
"epoch": 2.1,
"grad_norm": 2.396730422973633,
"learning_rate": 1.262449018400883e-06,
"loss": 0.1641,
"step": 12032
},
{
"epoch": 2.1,
"grad_norm": 1.4008959531784058,
"learning_rate": 1.248391952110327e-06,
"loss": 0.1631,
"step": 12064
},
{
"epoch": 2.11,
"grad_norm": 1.4454419612884521,
"learning_rate": 1.2343874956582586e-06,
"loss": 0.1599,
"step": 12096
},
{
"epoch": 2.11,
"grad_norm": 2.054921865463257,
"learning_rate": 1.2204362377051562e-06,
"loss": 0.1639,
"step": 12128
},
{
"epoch": 2.12,
"grad_norm": 1.5172858238220215,
"learning_rate": 1.2065387646753637e-06,
"loss": 0.159,
"step": 12160
},
{
"epoch": 2.12,
"grad_norm": 1.640461802482605,
"learning_rate": 1.192695660732439e-06,
"loss": 0.1607,
"step": 12192
},
{
"epoch": 2.13,
"grad_norm": 1.2705048322677612,
"learning_rate": 1.1789075077546033e-06,
"loss": 0.1691,
"step": 12224
},
{
"epoch": 2.13,
"grad_norm": 1.6255841255187988,
"learning_rate": 1.1651748853102757e-06,
"loss": 0.1572,
"step": 12256
},
{
"epoch": 2.14,
"grad_norm": 1.617698073387146,
"learning_rate": 1.1514983706337212e-06,
"loss": 0.1664,
"step": 12288
},
{
"epoch": 2.15,
"grad_norm": 2.176952362060547,
"learning_rate": 1.137878538600781e-06,
"loss": 0.1642,
"step": 12320
},
{
"epoch": 2.15,
"grad_norm": 2.0179526805877686,
"learning_rate": 1.1243159617047051e-06,
"loss": 0.1555,
"step": 12352
},
{
"epoch": 2.16,
"grad_norm": 1.6655995845794678,
"learning_rate": 1.1108112100321002e-06,
"loss": 0.1704,
"step": 12384
},
{
"epoch": 2.16,
"grad_norm": 1.541496992111206,
"learning_rate": 1.0973648512389526e-06,
"loss": 0.1611,
"step": 12416
},
{
"epoch": 2.17,
"grad_norm": 1.7085323333740234,
"learning_rate": 1.0839774505267777e-06,
"loss": 0.1663,
"step": 12448
},
{
"epoch": 2.17,
"grad_norm": 1.993090033531189,
"learning_rate": 1.0706495706188584e-06,
"loss": 0.1569,
"step": 12480
},
{
"epoch": 2.18,
"grad_norm": 1.4851964712142944,
"learning_rate": 1.0573817717365914e-06,
"loss": 0.1651,
"step": 12512
},
{
"epoch": 2.18,
"grad_norm": 1.714492917060852,
"learning_rate": 1.0441746115759407e-06,
"loss": 0.1572,
"step": 12544
},
{
"epoch": 2.19,
"grad_norm": 1.7523784637451172,
"learning_rate": 1.031028645283994e-06,
"loss": 0.1736,
"step": 12576
},
{
"epoch": 2.2,
"grad_norm": 1.5587310791015625,
"learning_rate": 1.0179444254356294e-06,
"loss": 0.168,
"step": 12608
},
{
"epoch": 2.2,
"grad_norm": 1.5781267881393433,
"learning_rate": 1.004922502010284e-06,
"loss": 0.1606,
"step": 12640
},
{
"epoch": 2.21,
"grad_norm": 1.6072574853897095,
"learning_rate": 9.919634223688452e-07,
"loss": 0.1614,
"step": 12672
},
{
"epoch": 2.21,
"grad_norm": 1.9626734256744385,
"learning_rate": 9.790677312306346e-07,
"loss": 0.1711,
"step": 12704
},
{
"epoch": 2.22,
"grad_norm": 1.87946355342865,
"learning_rate": 9.662359706505113e-07,
"loss": 0.1652,
"step": 12736
},
{
"epoch": 2.22,
"grad_norm": 2.207897186279297,
"learning_rate": 9.534686799960977e-07,
"loss": 0.1558,
"step": 12768
},
{
"epoch": 2.23,
"grad_norm": 1.8241429328918457,
"learning_rate": 9.407663959250932e-07,
"loss": 0.1572,
"step": 12800
},
{
"epoch": 2.23,
"grad_norm": 1.6716265678405762,
"learning_rate": 9.281296523627276e-07,
"loss": 0.1558,
"step": 12832
},
{
"epoch": 2.24,
"grad_norm": 1.7584782838821411,
"learning_rate": 9.15558980479313e-07,
"loss": 0.1532,
"step": 12864
},
{
"epoch": 2.25,
"grad_norm": 1.6568748950958252,
"learning_rate": 9.030549086679188e-07,
"loss": 0.167,
"step": 12896
},
{
"epoch": 2.25,
"grad_norm": 1.6095608472824097,
"learning_rate": 8.906179625221597e-07,
"loss": 0.1585,
"step": 12928
},
{
"epoch": 2.26,
"grad_norm": 1.4875459671020508,
"learning_rate": 8.782486648141042e-07,
"loss": 0.1626,
"step": 12960
},
{
"epoch": 2.26,
"grad_norm": 1.884362816810608,
"learning_rate": 8.659475354723007e-07,
"loss": 0.1596,
"step": 12992
},
{
"epoch": 2.27,
"grad_norm": 2.296562433242798,
"learning_rate": 8.53715091559919e-07,
"loss": 0.1652,
"step": 13024
},
{
"epoch": 2.27,
"grad_norm": 2.080160140991211,
"learning_rate": 8.415518472530251e-07,
"loss": 0.1535,
"step": 13056
},
{
"epoch": 2.28,
"grad_norm": 1.603214979171753,
"learning_rate": 8.294583138189597e-07,
"loss": 0.1699,
"step": 13088
},
{
"epoch": 2.28,
"grad_norm": 1.718169927597046,
"learning_rate": 8.174349995948483e-07,
"loss": 0.1688,
"step": 13120
},
{
"epoch": 2.29,
"grad_norm": 1.613429307937622,
"learning_rate": 8.054824099662429e-07,
"loss": 0.1643,
"step": 13152
},
{
"epoch": 2.3,
"grad_norm": 1.3368054628372192,
"learning_rate": 7.936010473458653e-07,
"loss": 0.1556,
"step": 13184
},
{
"epoch": 2.3,
"grad_norm": 1.404004454612732,
"learning_rate": 7.817914111524999e-07,
"loss": 0.1588,
"step": 13216
},
{
"epoch": 2.31,
"grad_norm": 1.7263784408569336,
"learning_rate": 7.700539977899962e-07,
"loss": 0.1525,
"step": 13248
},
{
"epoch": 2.31,
"grad_norm": 1.7289042472839355,
"learning_rate": 7.583893006264035e-07,
"loss": 0.1633,
"step": 13280
},
{
"epoch": 2.32,
"grad_norm": 1.5951069593429565,
"learning_rate": 7.467978099732331e-07,
"loss": 0.1539,
"step": 13312
},
{
"epoch": 2.32,
"grad_norm": 2.1001362800598145,
"learning_rate": 7.352800130648494e-07,
"loss": 0.1627,
"step": 13344
},
{
"epoch": 2.33,
"grad_norm": 1.5156986713409424,
"learning_rate": 7.238363940379881e-07,
"loss": 0.156,
"step": 13376
},
{
"epoch": 2.33,
"grad_norm": 1.8548262119293213,
"learning_rate": 7.124674339114071e-07,
"loss": 0.1758,
"step": 13408
},
{
"epoch": 2.34,
"grad_norm": 1.5202828645706177,
"learning_rate": 7.011736105656675e-07,
"loss": 0.1562,
"step": 13440
},
{
"epoch": 2.35,
"grad_norm": 1.8769176006317139,
"learning_rate": 6.89955398723047e-07,
"loss": 0.1598,
"step": 13472
},
{
"epoch": 2.35,
"grad_norm": 1.5129636526107788,
"learning_rate": 6.788132699275813e-07,
"loss": 0.1675,
"step": 13504
},
{
"epoch": 2.36,
"grad_norm": 1.8350058794021606,
"learning_rate": 6.677476925252524e-07,
"loss": 0.1658,
"step": 13536
},
{
"epoch": 2.36,
"grad_norm": 1.9995418787002563,
"learning_rate": 6.567591316442911e-07,
"loss": 0.1722,
"step": 13568
},
{
"epoch": 2.37,
"grad_norm": 1.6814707517623901,
"learning_rate": 6.458480491756347e-07,
"loss": 0.1615,
"step": 13600
},
{
"epoch": 2.37,
"grad_norm": 1.6658669710159302,
"learning_rate": 6.350149037535075e-07,
"loss": 0.1544,
"step": 13632
},
{
"epoch": 2.38,
"grad_norm": 1.5126328468322754,
"learning_rate": 6.242601507361442e-07,
"loss": 0.1577,
"step": 13664
},
{
"epoch": 2.38,
"grad_norm": 1.7326850891113281,
"learning_rate": 6.135842421866486e-07,
"loss": 0.1538,
"step": 13696
},
{
"epoch": 2.39,
"grad_norm": 1.641743779182434,
"learning_rate": 6.029876268539925e-07,
"loss": 0.1612,
"step": 13728
},
{
"epoch": 2.4,
"grad_norm": 1.7537841796875,
"learning_rate": 5.924707501541527e-07,
"loss": 0.1607,
"step": 13760
},
{
"epoch": 2.4,
"grad_norm": 1.8395192623138428,
"learning_rate": 5.820340541513886e-07,
"loss": 0.1621,
"step": 13792
},
{
"epoch": 2.41,
"grad_norm": 1.7613660097122192,
"learning_rate": 5.7167797753966e-07,
"loss": 0.1586,
"step": 13824
},
{
"epoch": 2.41,
"grad_norm": 1.8884879350662231,
"learning_rate": 5.61402955624189e-07,
"loss": 0.1639,
"step": 13856
},
{
"epoch": 2.42,
"grad_norm": 1.3437737226486206,
"learning_rate": 5.512094203031576e-07,
"loss": 0.1537,
"step": 13888
},
{
"epoch": 2.42,
"grad_norm": 1.8541299104690552,
"learning_rate": 5.410978000495621e-07,
"loss": 0.1609,
"step": 13920
},
{
"epoch": 2.43,
"grad_norm": 1.6411800384521484,
"learning_rate": 5.310685198931926e-07,
"loss": 0.1597,
"step": 13952
},
{
"epoch": 2.43,
"grad_norm": 1.4559264183044434,
"learning_rate": 5.211220014027746e-07,
"loss": 0.1563,
"step": 13984
},
{
"epoch": 2.44,
"grad_norm": 1.6533401012420654,
"learning_rate": 5.112586626682467e-07,
"loss": 0.1614,
"step": 14016
},
{
"epoch": 2.45,
"grad_norm": 1.519086241722107,
"learning_rate": 5.014789182831858e-07,
"loss": 0.1649,
"step": 14048
},
{
"epoch": 2.45,
"grad_norm": 1.6548802852630615,
"learning_rate": 4.917831793273814e-07,
"loss": 0.1579,
"step": 14080
},
{
"epoch": 2.46,
"grad_norm": 1.4974662065505981,
"learning_rate": 4.821718533495553e-07,
"loss": 0.1581,
"step": 14112
},
{
"epoch": 2.46,
"grad_norm": 1.5565108060836792,
"learning_rate": 4.7264534435023186e-07,
"loss": 0.1664,
"step": 14144
},
{
"epoch": 2.47,
"grad_norm": 1.8209164142608643,
"learning_rate": 4.6320405276475524e-07,
"loss": 0.1649,
"step": 14176
},
{
"epoch": 2.47,
"grad_norm": 1.8029505014419556,
"learning_rate": 4.5384837544645956e-07,
"loss": 0.1691,
"step": 14208
},
{
"epoch": 2.48,
"grad_norm": 2.0014963150024414,
"learning_rate": 4.445787056499826e-07,
"loss": 0.1588,
"step": 14240
},
{
"epoch": 2.49,
"grad_norm": 1.701003909111023,
"learning_rate": 4.3539543301474446e-07,
"loss": 0.1653,
"step": 14272
},
{
"epoch": 2.49,
"grad_norm": 1.8511948585510254,
"learning_rate": 4.262989435485615e-07,
"loss": 0.1554,
"step": 14304
},
{
"epoch": 2.5,
"grad_norm": 2.2707128524780273,
"learning_rate": 4.172896196114234e-07,
"loss": 0.1555,
"step": 14336
},
{
"epoch": 2.5,
"grad_norm": 1.784833550453186,
"learning_rate": 4.083678398994237e-07,
"loss": 0.1541,
"step": 14368
},
{
"epoch": 2.51,
"grad_norm": 2.091581106185913,
"learning_rate": 3.995339794288383e-07,
"loss": 0.157,
"step": 14400
},
{
"epoch": 2.51,
"grad_norm": 1.475711703300476,
"learning_rate": 3.9078840952036455e-07,
"loss": 0.1652,
"step": 14432
},
{
"epoch": 2.52,
"grad_norm": 2.0263583660125732,
"learning_rate": 3.8213149778351164e-07,
"loss": 0.163,
"step": 14464
},
{
"epoch": 2.52,
"grad_norm": 2.084484100341797,
"learning_rate": 3.73563608101149e-07,
"loss": 0.1589,
"step": 14496
},
{
"epoch": 2.53,
"grad_norm": 1.5741300582885742,
"learning_rate": 3.65085100614212e-07,
"loss": 0.1547,
"step": 14528
},
{
"epoch": 2.54,
"grad_norm": 1.7066149711608887,
"learning_rate": 3.566963317065622e-07,
"loss": 0.1628,
"step": 14560
},
{
"epoch": 2.54,
"grad_norm": 1.4174087047576904,
"learning_rate": 3.483976539900083e-07,
"loss": 0.1589,
"step": 14592
},
{
"epoch": 2.55,
"grad_norm": 1.6838244199752808,
"learning_rate": 3.401894162894828e-07,
"loss": 0.1492,
"step": 14624
},
{
"epoch": 2.55,
"grad_norm": 1.6532583236694336,
"learning_rate": 3.320719636283837e-07,
"loss": 0.1589,
"step": 14656
},
{
"epoch": 2.56,
"grad_norm": 1.447808861732483,
"learning_rate": 3.240456372140674e-07,
"loss": 0.1633,
"step": 14688
},
{
"epoch": 2.56,
"grad_norm": 1.8401908874511719,
"learning_rate": 3.161107744235067e-07,
"loss": 0.1607,
"step": 14720
},
{
"epoch": 2.57,
"grad_norm": 1.8145461082458496,
"learning_rate": 3.082677087891148e-07,
"loss": 0.1528,
"step": 14752
},
{
"epoch": 2.57,
"grad_norm": 2.0109219551086426,
"learning_rate": 3.0051676998471807e-07,
"loss": 0.1611,
"step": 14784
},
{
"epoch": 2.58,
"grad_norm": 1.9445598125457764,
"learning_rate": 2.9285828381170443e-07,
"loss": 0.1545,
"step": 14816
},
{
"epoch": 2.59,
"grad_norm": 1.956513524055481,
"learning_rate": 2.852925721853264e-07,
"loss": 0.1515,
"step": 14848
},
{
"epoch": 2.59,
"grad_norm": 1.5658009052276611,
"learning_rate": 2.7781995312117005e-07,
"loss": 0.1475,
"step": 14880
},
{
"epoch": 2.6,
"grad_norm": 1.5122560262680054,
"learning_rate": 2.704407407217871e-07,
"loss": 0.1485,
"step": 14912
},
{
"epoch": 2.6,
"grad_norm": 1.7659705877304077,
"learning_rate": 2.631552451634931e-07,
"loss": 0.1516,
"step": 14944
},
{
"epoch": 2.61,
"grad_norm": 1.709915041923523,
"learning_rate": 2.5596377268332916e-07,
"loss": 0.1578,
"step": 14976
},
{
"epoch": 2.61,
"grad_norm": 1.7999032735824585,
"learning_rate": 2.488666255661873e-07,
"loss": 0.1593,
"step": 15008
},
{
"epoch": 2.62,
"grad_norm": 1.7251070737838745,
"learning_rate": 2.418641021321097e-07,
"loss": 0.1457,
"step": 15040
},
{
"epoch": 2.62,
"grad_norm": 1.96919584274292,
"learning_rate": 2.3495649672374442e-07,
"loss": 0.158,
"step": 15072
},
{
"epoch": 2.63,
"grad_norm": 1.7707350254058838,
"learning_rate": 2.281440996939724e-07,
"loss": 0.1522,
"step": 15104
},
{
"epoch": 2.64,
"grad_norm": 1.8055440187454224,
"learning_rate": 2.2142719739370876e-07,
"loss": 0.1644,
"step": 15136
},
{
"epoch": 2.64,
"grad_norm": 1.5484322309494019,
"learning_rate": 2.1480607215985938e-07,
"loss": 0.153,
"step": 15168
},
{
"epoch": 2.65,
"grad_norm": 2.3656980991363525,
"learning_rate": 2.0828100230345815e-07,
"loss": 0.1666,
"step": 15200
},
{
"epoch": 2.65,
"grad_norm": 2.2295897006988525,
"learning_rate": 2.018522620979657e-07,
"loss": 0.1598,
"step": 15232
},
{
"epoch": 2.66,
"grad_norm": 1.66213858127594,
"learning_rate": 1.95520121767743e-07,
"loss": 0.1458,
"step": 15264
},
{
"epoch": 2.66,
"grad_norm": 1.7052069902420044,
"learning_rate": 1.8928484747669007e-07,
"loss": 0.1582,
"step": 15296
},
{
"epoch": 2.67,
"grad_norm": 2.455540895462036,
"learning_rate": 1.8314670131706015e-07,
"loss": 0.1626,
"step": 15328
},
{
"epoch": 2.67,
"grad_norm": 1.7177009582519531,
"learning_rate": 1.771059412984427e-07,
"loss": 0.1489,
"step": 15360
},
{
"epoch": 2.68,
"grad_norm": 1.9176840782165527,
"learning_rate": 1.7116282133691624e-07,
"loss": 0.155,
"step": 15392
},
{
"epoch": 2.69,
"grad_norm": 1.6064426898956299,
"learning_rate": 1.6531759124437967e-07,
"loss": 0.1646,
"step": 15424
},
{
"epoch": 2.69,
"grad_norm": 1.764961838722229,
"learning_rate": 1.5957049671804753e-07,
"loss": 0.1553,
"step": 15456
},
{
"epoch": 2.7,
"grad_norm": 1.645261287689209,
"learning_rate": 1.5392177933012258e-07,
"loss": 0.1661,
"step": 15488
},
{
"epoch": 2.7,
"grad_norm": 1.78240168094635,
"learning_rate": 1.4837167651764573e-07,
"loss": 0.1724,
"step": 15520
},
{
"epoch": 2.71,
"grad_norm": 2.0585949420928955,
"learning_rate": 1.4292042157251023e-07,
"loss": 0.1539,
"step": 15552
},
{
"epoch": 2.71,
"grad_norm": 1.6157474517822266,
"learning_rate": 1.3756824363165943e-07,
"loss": 0.1574,
"step": 15584
},
{
"epoch": 2.72,
"grad_norm": 1.6896705627441406,
"learning_rate": 1.3231536766745517e-07,
"loss": 0.1629,
"step": 15616
},
{
"epoch": 2.72,
"grad_norm": 2.022658109664917,
"learning_rate": 1.2716201447821763e-07,
"loss": 0.1529,
"step": 15648
},
{
"epoch": 2.73,
"grad_norm": 1.5828081369400024,
"learning_rate": 1.2210840067894857e-07,
"loss": 0.1633,
"step": 15680
},
{
"epoch": 2.74,
"grad_norm": 1.9208072423934937,
"learning_rate": 1.1715473869222393e-07,
"loss": 0.159,
"step": 15712
},
{
"epoch": 2.74,
"grad_norm": 1.5745195150375366,
"learning_rate": 1.123012367392659e-07,
"loss": 0.1641,
"step": 15744
},
{
"epoch": 2.75,
"grad_norm": 1.5185731649398804,
"learning_rate": 1.0754809883118916e-07,
"loss": 0.1553,
"step": 15776
},
{
"epoch": 2.75,
"grad_norm": 2.097116470336914,
"learning_rate": 1.0289552476042768e-07,
"loss": 0.1502,
"step": 15808
},
{
"epoch": 2.76,
"grad_norm": 1.7868729829788208,
"learning_rate": 9.83437100923354e-08,
"loss": 0.1576,
"step": 15840
},
{
"epoch": 2.76,
"grad_norm": 1.7661504745483398,
"learning_rate": 9.389284615696464e-08,
"loss": 0.1658,
"step": 15872
},
{
"epoch": 2.77,
"grad_norm": 1.692753553390503,
"learning_rate": 8.954312004102711e-08,
"loss": 0.1586,
"step": 15904
},
{
"epoch": 2.77,
"grad_norm": 1.4457049369812012,
"learning_rate": 8.529471458002648e-08,
"loss": 0.1659,
"step": 15936
},
{
"epoch": 2.78,
"grad_norm": 1.9470157623291016,
"learning_rate": 8.114780835057456e-08,
"loss": 0.151,
"step": 15968
},
{
"epoch": 2.79,
"grad_norm": 1.5848197937011719,
"learning_rate": 7.710257566288681e-08,
"loss": 0.1512,
"step": 16000
},
{
"epoch": 2.79,
"grad_norm": 2.0228586196899414,
"learning_rate": 7.315918655345117e-08,
"loss": 0.1622,
"step": 16032
},
{
"epoch": 2.8,
"grad_norm": 1.6379644870758057,
"learning_rate": 6.931780677788546e-08,
"loss": 0.1609,
"step": 16064
},
{
"epoch": 2.8,
"grad_norm": 1.4579676389694214,
"learning_rate": 6.557859780396663e-08,
"loss": 0.1578,
"step": 16096
},
{
"epoch": 2.81,
"grad_norm": 1.8530055284500122,
"learning_rate": 6.194171680484556e-08,
"loss": 0.1566,
"step": 16128
},
{
"epoch": 2.81,
"grad_norm": 2.0496585369110107,
"learning_rate": 5.8407316652438764e-08,
"loss": 0.1613,
"step": 16160
},
{
"epoch": 2.82,
"grad_norm": 1.8340975046157837,
"learning_rate": 5.4975545911005176e-08,
"loss": 0.1598,
"step": 16192
},
{
"epoch": 2.83,
"grad_norm": 2.1443088054656982,
"learning_rate": 5.164654883089926e-08,
"loss": 0.1536,
"step": 16224
},
{
"epoch": 2.83,
"grad_norm": 1.8571709394454956,
"learning_rate": 4.842046534250716e-08,
"loss": 0.1568,
"step": 16256
},
{
"epoch": 2.84,
"grad_norm": 2.0833983421325684,
"learning_rate": 4.529743105036844e-08,
"loss": 0.1702,
"step": 16288
},
{
"epoch": 2.84,
"grad_norm": 1.618950605392456,
"learning_rate": 4.227757722747139e-08,
"loss": 0.1471,
"step": 16320
},
{
"epoch": 2.85,
"grad_norm": 1.9389454126358032,
"learning_rate": 3.9361030809738074e-08,
"loss": 0.1563,
"step": 16352
},
{
"epoch": 2.85,
"grad_norm": 1.731814980506897,
"learning_rate": 3.6547914390688835e-08,
"loss": 0.159,
"step": 16384
},
{
"epoch": 2.86,
"grad_norm": 2.0916991233825684,
"learning_rate": 3.3838346216287785e-08,
"loss": 0.1531,
"step": 16416
},
{
"epoch": 2.86,
"grad_norm": 1.8886840343475342,
"learning_rate": 3.1232440179972954e-08,
"loss": 0.1494,
"step": 16448
},
{
"epoch": 2.87,
"grad_norm": 1.890721321105957,
"learning_rate": 2.8730305817869786e-08,
"loss": 0.1544,
"step": 16480
},
{
"epoch": 2.88,
"grad_norm": 1.5375641584396362,
"learning_rate": 2.6332048304185677e-08,
"loss": 0.1577,
"step": 16512
},
{
"epoch": 2.88,
"grad_norm": 1.7129803895950317,
"learning_rate": 2.40377684467899e-08,
"loss": 0.161,
"step": 16544
},
{
"epoch": 2.89,
"grad_norm": 1.6450425386428833,
"learning_rate": 2.1847562682976166e-08,
"loss": 0.1541,
"step": 16576
},
{
"epoch": 2.89,
"grad_norm": 1.6297754049301147,
"learning_rate": 1.976152307540863e-08,
"loss": 0.1613,
"step": 16608
},
{
"epoch": 2.9,
"grad_norm": 1.9899592399597168,
"learning_rate": 1.777973730825222e-08,
"loss": 0.1499,
"step": 16640
},
{
"epoch": 2.9,
"grad_norm": 1.3760799169540405,
"learning_rate": 1.590228868348781e-08,
"loss": 0.1545,
"step": 16672
},
{
"epoch": 2.91,
"grad_norm": 2.1230156421661377,
"learning_rate": 1.4129256117409451e-08,
"loss": 0.1669,
"step": 16704
},
{
"epoch": 2.91,
"grad_norm": 1.7051235437393188,
"learning_rate": 1.2460714137307594e-08,
"loss": 0.1577,
"step": 16736
},
{
"epoch": 2.92,
"grad_norm": 1.5741279125213623,
"learning_rate": 1.08967328783377e-08,
"loss": 0.1626,
"step": 16768
},
{
"epoch": 2.93,
"grad_norm": 1.71488356590271,
"learning_rate": 9.437378080569825e-09,
"loss": 0.1576,
"step": 16800
},
{
"epoch": 2.93,
"grad_norm": 1.9407358169555664,
"learning_rate": 8.082711086226936e-09,
"loss": 0.1656,
"step": 16832
},
{
"epoch": 2.94,
"grad_norm": 2.023576259613037,
"learning_rate": 6.832788837106974e-09,
"loss": 0.2152,
"step": 16864
},
{
"epoch": 2.94,
"grad_norm": 1.999402403831482,
"learning_rate": 5.687663872187555e-09,
"loss": 0.1555,
"step": 16896
},
{
"epoch": 2.95,
"grad_norm": 1.8923932313919067,
"learning_rate": 4.647384325418835e-09,
"loss": 0.1519,
"step": 16928
},
{
"epoch": 2.95,
"grad_norm": 1.6705876588821411,
"learning_rate": 3.7119939237001412e-09,
"loss": 0.1514,
"step": 16960
},
{
"epoch": 2.96,
"grad_norm": 1.5758410692214966,
"learning_rate": 2.8815319850414303e-09,
"loss": 0.1642,
"step": 16992
},
{
"epoch": 2.96,
"grad_norm": 1.5247572660446167,
"learning_rate": 2.1560334169112852e-09,
"loss": 0.1592,
"step": 17024
},
{
"epoch": 2.97,
"grad_norm": 1.464015007019043,
"learning_rate": 1.5355287147694742e-09,
"loss": 0.1534,
"step": 17056
},
{
"epoch": 2.98,
"grad_norm": 1.6785483360290527,
"learning_rate": 1.020043960784367e-09,
"loss": 0.162,
"step": 17088
},
{
"epoch": 2.98,
"grad_norm": 1.7320752143859863,
"learning_rate": 6.096008227371441e-10,
"loss": 0.1565,
"step": 17120
},
{
"epoch": 2.99,
"grad_norm": 1.4247487783432007,
"learning_rate": 3.042165531116914e-10,
"loss": 0.1616,
"step": 17152
},
{
"epoch": 2.99,
"grad_norm": 1.5723894834518433,
"learning_rate": 1.0390398836851446e-10,
"loss": 0.1567,
"step": 17184
},
{
"epoch": 3.0,
"grad_norm": 1.970982551574707,
"learning_rate": 8.671548404615149e-12,
"loss": 0.1556,
"step": 17216
}
],
"logging_steps": 32,
"max_steps": 17229,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5743,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}