{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 5743,
  "global_step": 17229,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 90.67318725585938, "learning_rate": 2.901915264074289e-09, "loss": 4.6958, "step": 1 },
    { "epoch": 0.01, "grad_norm": 104.22982025146484, "learning_rate": 9.286128845037725e-08, "loss": 5.0471, "step": 32 },
    { "epoch": 0.01, "grad_norm": 29.807086944580078, "learning_rate": 1.857225769007545e-07, "loss": 4.7906, "step": 64 },
    { "epoch": 0.02, "grad_norm": 25.591365814208984, "learning_rate": 2.785838653511318e-07, "loss": 4.3938, "step": 96 },
    { "epoch": 0.02, "grad_norm": 25.191415786743164, "learning_rate": 3.71445153801509e-07, "loss": 4.2526, "step": 128 },
    { "epoch": 0.03, "grad_norm": 34.85346984863281, "learning_rate": 4.643064422518863e-07, "loss": 4.1321, "step": 160 },
    { "epoch": 0.03, "grad_norm": 27.427072525024414, "learning_rate": 5.571677307022636e-07, "loss": 3.8133, "step": 192 },
    { "epoch": 0.04, "grad_norm": 21.698957443237305, "learning_rate": 6.500290191526408e-07, "loss": 3.8467, "step": 224 },
    { "epoch": 0.04, "grad_norm": 44.60033416748047, "learning_rate": 7.42890307603018e-07, "loss": 3.7419, "step": 256 },
    { "epoch": 0.05, "grad_norm": 49.22486877441406, "learning_rate": 8.357515960533953e-07, "loss": 3.6156, "step": 288 },
    { "epoch": 0.06, "grad_norm": 36.528568267822266, "learning_rate": 9.286128845037726e-07, "loss": 3.3479, "step": 320 },
    { "epoch": 0.06, "grad_norm": 48.99635696411133, "learning_rate": 1.0214741729541498e-06, "loss": 3.0823, "step": 352 },
    { "epoch": 0.07, "grad_norm": 44.24087142944336, "learning_rate": 1.1143354614045271e-06, "loss": 2.7423, "step": 384 },
    { "epoch": 0.07, "grad_norm": 50.86935806274414, "learning_rate": 1.2071967498549044e-06, "loss": 2.4941, "step": 416 },
    { "epoch": 0.08, "grad_norm": 31.083118438720703, "learning_rate": 1.3000580383052816e-06, "loss": 2.2775, "step": 448 },
    { "epoch": 0.08, "grad_norm": 35.24567413330078, "learning_rate": 1.3929193267556587e-06, "loss": 1.9606, "step": 480 },
    { "epoch": 0.09, "grad_norm": 53.384952545166016, "learning_rate": 1.485780615206036e-06, "loss": 1.6271, "step": 512 },
    { "epoch": 0.09, "grad_norm": 47.57023239135742, "learning_rate": 1.5786419036564131e-06, "loss": 1.3614, "step": 544 },
    { "epoch": 0.1, "grad_norm": 38.1913948059082, "learning_rate": 1.6715031921067907e-06, "loss": 1.007, "step": 576 },
    { "epoch": 0.11, "grad_norm": 24.733055114746094, "learning_rate": 1.7643644805571678e-06, "loss": 0.7387, "step": 608 },
    { "epoch": 0.11, "grad_norm": 20.36598777770996, "learning_rate": 1.8572257690075451e-06, "loss": 0.5821, "step": 640 },
    { "epoch": 0.12, "grad_norm": 9.40074634552002, "learning_rate": 1.9500870574579222e-06, "loss": 0.4921, "step": 672 },
    { "epoch": 0.12, "grad_norm": 9.734894752502441, "learning_rate": 2.0429483459082996e-06, "loss": 0.4433, "step": 704 },
    { "epoch": 0.13, "grad_norm": 8.26183032989502, "learning_rate": 2.135809634358677e-06, "loss": 0.4163, "step": 736 },
    { "epoch": 0.13, "grad_norm": 4.765627861022949, "learning_rate": 2.2286709228090542e-06, "loss": 0.403, "step": 768 },
    { "epoch": 0.14, "grad_norm": 3.9424140453338623, "learning_rate": 2.321532211259431e-06, "loss": 0.4155, "step": 800 },
    { "epoch": 0.14, "grad_norm": 3.6798696517944336, "learning_rate": 2.414393499709809e-06, "loss": 0.3943, "step": 832 },
    { "epoch": 0.15, "grad_norm": 3.714017868041992, "learning_rate": 2.5072547881601862e-06, "loss": 0.3845, "step": 864 },
    { "epoch": 0.16, "grad_norm": 3.57197642326355, "learning_rate": 2.600116076610563e-06, "loss": 0.3699, "step": 896 },
    { "epoch": 0.16, "grad_norm": 3.1965994834899902, "learning_rate": 2.6929773650609405e-06, "loss": 0.3559, "step": 928 },
    { "epoch": 0.17, "grad_norm": 2.980022668838501, "learning_rate": 2.7858386535113174e-06, "loss": 0.347, "step": 960 },
    { "epoch": 0.17, "grad_norm": 3.079709529876709, "learning_rate": 2.878699941961695e-06, "loss": 0.3415, "step": 992 },
    { "epoch": 0.18, "grad_norm": 2.8273375034332275, "learning_rate": 2.971561230412072e-06, "loss": 0.3531, "step": 1024 },
    { "epoch": 0.18, "grad_norm": 2.214918375015259, "learning_rate": 3.0644225188624494e-06, "loss": 0.3414, "step": 1056 },
    { "epoch": 0.19, "grad_norm": 2.4280753135681152, "learning_rate": 3.1572838073128263e-06, "loss": 0.3391, "step": 1088 },
    { "epoch": 0.2, "grad_norm": 1.955213189125061, "learning_rate": 3.250145095763204e-06, "loss": 0.3372, "step": 1120 },
    { "epoch": 0.2, "grad_norm": 2.2462759017944336, "learning_rate": 3.3430063842135814e-06, "loss": 0.3459, "step": 1152 },
    { "epoch": 0.21, "grad_norm": 2.0212900638580322, "learning_rate": 3.4358676726639583e-06, "loss": 0.3429, "step": 1184 },
    { "epoch": 0.21, "grad_norm": 1.7310266494750977, "learning_rate": 3.5287289611143356e-06, "loss": 0.3347, "step": 1216 },
    { "epoch": 0.22, "grad_norm": 2.134054183959961, "learning_rate": 3.6215902495647133e-06, "loss": 0.3187, "step": 1248 },
    { "epoch": 0.22, "grad_norm": 1.850546956062317, "learning_rate": 3.7144515380150902e-06, "loss": 0.3217, "step": 1280 },
    { "epoch": 0.23, "grad_norm": 2.3363544940948486, "learning_rate": 3.8073128264654676e-06, "loss": 0.3398, "step": 1312 },
    { "epoch": 0.23, "grad_norm": 1.707647681236267, "learning_rate": 3.9001741149158445e-06, "loss": 0.3132, "step": 1344 },
    { "epoch": 0.24, "grad_norm": 1.7750575542449951, "learning_rate": 3.993035403366222e-06, "loss": 0.315, "step": 1376 },
    { "epoch": 0.25, "grad_norm": 1.8037201166152954, "learning_rate": 4.085896691816599e-06, "loss": 0.3263, "step": 1408 },
    { "epoch": 0.25, "grad_norm": 2.307892084121704, "learning_rate": 4.1787579802669765e-06, "loss": 0.3119, "step": 1440 },
    { "epoch": 0.26, "grad_norm": 2.105405569076538, "learning_rate": 4.271619268717354e-06, "loss": 0.3253, "step": 1472 },
    { "epoch": 0.26, "grad_norm": 1.809043526649475, "learning_rate": 4.364480557167731e-06, "loss": 0.3293, "step": 1504 },
    { "epoch": 0.27, "grad_norm": 2.1056556701660156, "learning_rate": 4.4573418456181085e-06, "loss": 0.3091, "step": 1536 },
    { "epoch": 0.27, "grad_norm": 1.6170997619628906, "learning_rate": 4.550203134068486e-06, "loss": 0.3123, "step": 1568 },
    { "epoch": 0.28, "grad_norm": 1.9799582958221436, "learning_rate": 4.643064422518862e-06, "loss": 0.3225, "step": 1600 },
    { "epoch": 0.28, "grad_norm": 1.9572292566299438, "learning_rate": 4.7359257109692405e-06, "loss": 0.3257, "step": 1632 },
    { "epoch": 0.29, "grad_norm": 1.9340535402297974, "learning_rate": 4.828786999419618e-06, "loss": 0.3176, "step": 1664 },
    { "epoch": 0.3, "grad_norm": 1.9439414739608765, "learning_rate": 4.921648287869994e-06, "loss": 0.3114, "step": 1696 },
    { "epoch": 0.3, "grad_norm": 1.8130865097045898, "learning_rate": 4.999998717225936e-06, "loss": 0.3136, "step": 1728 },
    { "epoch": 0.31, "grad_norm": 1.8154480457305908, "learning_rate": 4.999929755615174e-06, "loss": 0.3215, "step": 1760 },
    { "epoch": 0.31, "grad_norm": 1.8465503454208374, "learning_rate": 4.999755712464791e-06, "loss": 0.3178, "step": 1792 },
    { "epoch": 0.32, "grad_norm": 1.7973068952560425, "learning_rate": 4.999476595090482e-06, "loss": 0.3196, "step": 1824 },
    { "epoch": 0.32, "grad_norm": 1.5011913776397705, "learning_rate": 4.99909241522461e-06, "loss": 0.3194, "step": 1856 },
    { "epoch": 0.33, "grad_norm": 1.8762191534042358, "learning_rate": 4.998603189015714e-06, "loss": 0.3238, "step": 1888 },
    { "epoch": 0.33, "grad_norm": 1.81588876247406, "learning_rate": 4.9980089370278275e-06, "loss": 0.328, "step": 1920 },
    { "epoch": 0.34, "grad_norm": 1.8873347043991089, "learning_rate": 4.997309684239618e-06, "loss": 0.3189, "step": 1952 },
    { "epoch": 0.35, "grad_norm": 1.7700365781784058, "learning_rate": 4.996505460043337e-06, "loss": 0.303, "step": 1984 },
    { "epoch": 0.35, "grad_norm": 1.6417065858840942, "learning_rate": 4.99559629824358e-06, "loss": 0.3155, "step": 2016 },
    { "epoch": 0.36, "grad_norm": 1.689789891242981, "learning_rate": 4.99458223705587e-06, "loss": 0.3254, "step": 2048 },
    { "epoch": 0.36, "grad_norm": 2.12276291847229, "learning_rate": 4.993463319105047e-06, "loss": 0.2986, "step": 2080 },
    { "epoch": 0.37, "grad_norm": 1.7065095901489258, "learning_rate": 4.992239591423483e-06, "loss": 0.3197, "step": 2112 },
    { "epoch": 0.37, "grad_norm": 2.3180906772613525, "learning_rate": 4.990911105449098e-06, "loss": 0.3153, "step": 2144 },
    { "epoch": 0.38, "grad_norm": 1.8196240663528442, "learning_rate": 4.9894779170232024e-06, "loss": 0.313, "step": 2176 },
    { "epoch": 0.38, "grad_norm": 2.0028891563415527, "learning_rate": 4.987940086388146e-06, "loss": 0.31, "step": 2208 },
    { "epoch": 0.39, "grad_norm": 1.862817645072937, "learning_rate": 4.986297678184791e-06, "loss": 0.3035, "step": 2240 },
    { "epoch": 0.4, "grad_norm": 1.7188054323196411, "learning_rate": 4.984550761449788e-06, "loss": 0.3504, "step": 2272 },
    { "epoch": 0.4, "grad_norm": 1.6241610050201416, "learning_rate": 4.982699409612683e-06, "loss": 0.2926, "step": 2304 },
    { "epoch": 0.41, "grad_norm": 1.9147684574127197, "learning_rate": 4.980743700492822e-06, "loss": 0.3212, "step": 2336 },
    { "epoch": 0.41, "grad_norm": 1.5901225805282593, "learning_rate": 4.978683716296084e-06, "loss": 0.3057, "step": 2368 },
    { "epoch": 0.42, "grad_norm": 1.670644760131836, "learning_rate": 4.976519543611427e-06, "loss": 0.3038, "step": 2400 },
    { "epoch": 0.42, "grad_norm": 1.3500678539276123, "learning_rate": 4.974251273407246e-06, "loss": 0.3019, "step": 2432 },
    { "epoch": 0.43, "grad_norm": 1.6647801399230957, "learning_rate": 4.971879001027552e-06, "loss": 0.3017, "step": 2464 },
    { "epoch": 0.43, "grad_norm": 1.793252944946289, "learning_rate": 4.9694028261879576e-06, "loss": 0.3068, "step": 2496 },
    { "epoch": 0.44, "grad_norm": 1.9570722579956055, "learning_rate": 4.966822852971493e-06, "loss": 0.311, "step": 2528 },
    { "epoch": 0.45, "grad_norm": 1.787014365196228, "learning_rate": 4.964139189824232e-06, "loss": 0.2992, "step": 2560 },
    { "epoch": 0.45, "grad_norm": 1.68462073802948, "learning_rate": 4.961351949550722e-06, "loss": 0.3054, "step": 2592 },
    { "epoch": 0.46, "grad_norm": 1.7559980154037476, "learning_rate": 4.958461249309258e-06, "loss": 0.3119, "step": 2624 },
    { "epoch": 0.46, "grad_norm": 1.9941719770431519, "learning_rate": 4.955467210606944e-06, "loss": 0.3122, "step": 2656 },
    { "epoch": 0.47, "grad_norm": 1.6227741241455078, "learning_rate": 4.9523699592945966e-06, "loss": 0.3094, "step": 2688 },
    { "epoch": 0.47, "grad_norm": 1.8108834028244019, "learning_rate": 4.9491696255614475e-06, "loss": 0.3079, "step": 2720 },
    { "epoch": 0.48, "grad_norm": 1.871539831161499, "learning_rate": 4.945866343929675e-06, "loss": 0.3085, "step": 2752 },
    { "epoch": 0.48, "grad_norm": 1.8067561388015747, "learning_rate": 4.94246025324875e-06, "loss": 0.3138, "step": 2784 },
    { "epoch": 0.49, "grad_norm": 1.4682188034057617, "learning_rate": 4.938951496689593e-06, "loss": 0.3141, "step": 2816 },
    { "epoch": 0.5, "grad_norm": 1.8749171495437622, "learning_rate": 4.935340221738568e-06, "loss": 0.2903, "step": 2848 },
    { "epoch": 0.5, "grad_norm": 1.780875563621521, "learning_rate": 4.931626580191271e-06, "loss": 0.3086, "step": 2880 },
    { "epoch": 0.51, "grad_norm": 1.468619465827942, "learning_rate": 4.927810728146158e-06, "loss": 0.3016, "step": 2912 },
    { "epoch": 0.51, "grad_norm": 1.5029264688491821, "learning_rate": 4.923892825997976e-06, "loss": 0.2962, "step": 2944 },
    { "epoch": 0.52, "grad_norm": 1.9954643249511719, "learning_rate": 4.919873038431031e-06, "loss": 0.3135, "step": 2976 },
    { "epoch": 0.52, "grad_norm": 1.7070242166519165, "learning_rate": 4.915751534412256e-06, "loss": 0.3216, "step": 3008 },
    { "epoch": 0.53, "grad_norm": 1.9191008806228638, "learning_rate": 4.911528487184115e-06, "loss": 0.2814, "step": 3040 },
    { "epoch": 0.53, "grad_norm": 1.5027116537094116, "learning_rate": 4.9072040742573154e-06, "loss": 0.2917, "step": 3072 },
    { "epoch": 0.54, "grad_norm": 1.5248404741287231, "learning_rate": 4.902778477403354e-06, "loss": 0.3001, "step": 3104 },
    { "epoch": 0.55, "grad_norm": 1.996079921722412, "learning_rate": 4.89825188264687e-06, "loss": 0.2951, "step": 3136 },
    { "epoch": 0.55, "grad_norm": 1.7733569145202637, "learning_rate": 4.893624480257826e-06, "loss": 0.3031, "step": 3168 },
    { "epoch": 0.56, "grad_norm": 1.3993550539016724, "learning_rate": 4.888896464743515e-06, "loss": 0.2941, "step": 3200 },
    { "epoch": 0.56, "grad_norm": 1.37386953830719, "learning_rate": 4.884068034840382e-06, "loss": 0.2917, "step": 3232 },
    { "epoch": 0.57, "grad_norm": 1.7720749378204346, "learning_rate": 4.879139393505669e-06, "loss": 0.3038, "step": 3264 },
    { "epoch": 0.57, "grad_norm": 1.4496855735778809, "learning_rate": 4.874110747908883e-06, "loss": 0.3019, "step": 3296 },
    { "epoch": 0.58, "grad_norm": 1.70554518699646, "learning_rate": 4.868982309423094e-06, "loss": 0.2824, "step": 3328 },
    { "epoch": 0.59, "grad_norm": 1.6426483392715454, "learning_rate": 4.863754293616043e-06, "loss": 0.3064, "step": 3360 },
    { "epoch": 0.59, "grad_norm": 1.9177954196929932, "learning_rate": 4.858426920241083e-06, "loss": 0.3046, "step": 3392 },
    { "epoch": 0.6, "grad_norm": 1.8572829961776733, "learning_rate": 4.853000413227946e-06, "loss": 0.3032, "step": 3424 },
    { "epoch": 0.6, "grad_norm": 1.548661708831787, "learning_rate": 4.8474750006733265e-06, "loss": 0.2997, "step": 3456 },
    { "epoch": 0.61, "grad_norm": 1.4153339862823486, "learning_rate": 4.841850914831291e-06, "loss": 0.2939, "step": 3488 },
    { "epoch": 0.61, "grad_norm": 1.7126343250274658, "learning_rate": 4.836128392103524e-06, "loss": 0.3181, "step": 3520 },
    { "epoch": 0.62, "grad_norm": 1.8360246419906616, "learning_rate": 4.830307673029383e-06, "loss": 0.3026, "step": 3552 },
    { "epoch": 0.62, "grad_norm": 1.6076633930206299, "learning_rate": 4.82438900227579e-06, "loss": 0.3092, "step": 3584 },
    { "epoch": 0.63, "grad_norm": 1.890261173248291, "learning_rate": 4.8183726286269515e-06, "loss": 0.2965, "step": 3616 },
    { "epoch": 0.64, "grad_norm": 1.462218999862671, "learning_rate": 4.812258804973895e-06, "loss": 0.2851, "step": 3648 },
    { "epoch": 0.64, "grad_norm": 1.5263981819152832, "learning_rate": 4.806047788303841e-06, "loss": 0.3002, "step": 3680 },
    { "epoch": 0.65, "grad_norm": 1.9360984563827515, "learning_rate": 4.799739839689404e-06, "loss": 0.2918, "step": 3712 },
    { "epoch": 0.65, "grad_norm": 1.7371431589126587, "learning_rate": 4.7933352242776136e-06, "loss": 0.2962, "step": 3744 },
    { "epoch": 0.66, "grad_norm": 2.1363890171051025, "learning_rate": 4.786834211278775e-06, "loss": 0.2844, "step": 3776 },
    { "epoch": 0.66, "grad_norm": 1.2848248481750488, "learning_rate": 4.780237073955147e-06, "loss": 0.2927, "step": 3808 },
    { "epoch": 0.67, "grad_norm": 1.685262680053711, "learning_rate": 4.773544089609462e-06, "loss": 0.3018, "step": 3840 },
    { "epoch": 0.67, "grad_norm": 1.3517770767211914, "learning_rate": 4.766755539573261e-06, "loss": 0.2936, "step": 3872 },
    { "epoch": 0.68, "grad_norm": 1.723820447921753, "learning_rate": 4.759871709195081e-06, "loss": 0.2974, "step": 3904 },
    { "epoch": 0.69, "grad_norm": 1.8562276363372803, "learning_rate": 4.752892887828448e-06, "loss": 0.2967, "step": 3936 },
    { "epoch": 0.69, "grad_norm": 1.9959107637405396, "learning_rate": 4.745819368819723e-06, "loss": 0.2929, "step": 3968 },
    { "epoch": 0.7, "grad_norm": 2.034578323364258, "learning_rate": 4.738651449495767e-06, "loss": 0.2978, "step": 4000 },
    { "epoch": 0.7, "grad_norm": 1.2981053590774536, "learning_rate": 4.731389431151445e-06, "loss": 0.2951, "step": 4032 },
    { "epoch": 0.71, "grad_norm": 1.5861812829971313, "learning_rate": 4.72403361903696e-06, "loss": 0.2896, "step": 4064 },
    { "epoch": 0.71, "grad_norm": 1.2682240009307861, "learning_rate": 4.716584322345028e-06, "loss": 0.2973, "step": 4096 },
    { "epoch": 0.72, "grad_norm": 2.147078275680542, "learning_rate": 4.70904185419787e-06, "loss": 0.2944, "step": 4128 },
    { "epoch": 0.72, "grad_norm": 1.5924174785614014, "learning_rate": 4.7014065316340606e-06, "loss": 0.2859, "step": 4160 },
    { "epoch": 0.73, "grad_norm": 1.7698993682861328, "learning_rate": 4.693678675595199e-06, "loss": 0.2943, "step": 4192 },
    { "epoch": 0.74, "grad_norm": 1.599900245666504, "learning_rate": 4.685858610912416e-06, "loss": 0.3045, "step": 4224 },
    { "epoch": 0.74, "grad_norm": 1.6287124156951904, "learning_rate": 4.677946666292722e-06, "loss": 0.2884, "step": 4256 },
    { "epoch": 0.75, "grad_norm": 1.712890386581421, "learning_rate": 4.66994317430519e-06, "loss": 0.2946, "step": 4288 },
    { "epoch": 0.75, "grad_norm": 1.7418298721313477, "learning_rate": 4.661848471366977e-06, "loss": 0.2933, "step": 4320 },
    { "epoch": 0.76, "grad_norm": 1.7066587209701538, "learning_rate": 4.653662897729183e-06, "loss": 0.2934, "step": 4352 },
    { "epoch": 0.76, "grad_norm": 1.9766113758087158, "learning_rate": 4.645386797462547e-06, "loss": 0.3055, "step": 4384 },
    { "epoch": 0.77, "grad_norm": 1.7087821960449219, "learning_rate": 4.637020518442986e-06, "loss": 0.2857, "step": 4416 },
    { "epoch": 0.77, "grad_norm": 2.085517168045044, "learning_rate": 4.628564412336975e-06, "loss": 0.3509, "step": 4448 },
    { "epoch": 0.78, "grad_norm": 1.586540699005127, "learning_rate": 4.620018834586759e-06, "loss": 0.3071, "step": 4480 },
    { "epoch": 0.79, "grad_norm": 1.5229309797286987, "learning_rate": 4.611384144395419e-06, "loss": 0.2978, "step": 4512 },
    { "epoch": 0.79, "grad_norm": 1.5610827207565308, "learning_rate": 4.602660704711768e-06, "loss": 0.3, "step": 4544 },
    { "epoch": 0.8, "grad_norm": 1.3079458475112915, "learning_rate": 4.593848882215098e-06, "loss": 0.3023, "step": 4576 },
    { "epoch": 0.8, "grad_norm": 1.243058204650879, "learning_rate": 4.584949047299766e-06, "loss": 0.282, "step": 4608 },
    { "epoch": 0.81, "grad_norm": 1.5151655673980713, "learning_rate": 4.5759615740596265e-06, "loss": 0.2863, "step": 4640 },
    { "epoch": 0.81, "grad_norm": 1.6148176193237305, "learning_rate": 4.5668868402723024e-06, "loss": 0.29, "step": 4672 },
    { "epoch": 0.82, "grad_norm": 1.3599457740783691, "learning_rate": 4.557725227383313e-06, "loss": 0.292, "step": 4704 },
    { "epoch": 0.82, "grad_norm": 1.7854079008102417, "learning_rate": 4.548477120490031e-06, "loss": 0.3065, "step": 4736 },
    { "epoch": 0.83, "grad_norm": 1.7878711223602295, "learning_rate": 4.539142908325506e-06, "loss": 0.2986, "step": 4768 },
    { "epoch": 0.84, "grad_norm": 1.9896265268325806, "learning_rate": 4.529722983242114e-06, "loss": 0.3025, "step": 4800 },
    { "epoch": 0.84, "grad_norm": 1.5168676376342773, "learning_rate": 4.5202177411950745e-06, "loss": 0.2847, "step": 4832 },
    { "epoch": 0.85, "grad_norm": 1.6856423616409302, "learning_rate": 4.5106275817258e-06, "loss": 0.3038, "step": 4864 },
    { "epoch": 0.85, "grad_norm": 1.329127550125122, "learning_rate": 4.5009529079451085e-06, "loss": 0.2829, "step": 4896 },
    { "epoch": 0.86, "grad_norm": 1.9718722105026245, "learning_rate": 4.4911941265162695e-06, "loss": 0.2976, "step": 4928 },
    { "epoch": 0.86, "grad_norm": 1.4611300230026245, "learning_rate": 4.481351647637921e-06, "loss": 0.2829, "step": 4960 },
    { "epoch": 0.87, "grad_norm": 1.7366551160812378, "learning_rate": 4.471425885026822e-06, "loss": 0.2916, "step": 4992 },
    { "epoch": 0.87, "grad_norm": 1.381406307220459, "learning_rate": 4.46141725590046e-06, "loss": 0.2908, "step": 5024 },
    { "epoch": 0.88, "grad_norm": 1.4315983057022095, "learning_rate": 4.451326180959521e-06, "loss": 0.2705, "step": 5056 },
    { "epoch": 0.89, "grad_norm": 1.7121741771697998, "learning_rate": 4.4411530843702e-06, "loss": 0.2956, "step": 5088 },
    { "epoch": 0.89, "grad_norm": 1.845123529434204, "learning_rate": 4.430898393746371e-06, "loss": 0.2876, "step": 5120 },
    { "epoch": 0.9, "grad_norm": 1.3543351888656616, "learning_rate": 4.420562540131618e-06, "loss": 0.2962, "step": 5152 },
    { "epoch": 0.9, "grad_norm": 2.028374195098877, "learning_rate": 4.410145957981112e-06, "loss": 0.2937, "step": 5184 },
    { "epoch": 0.91, "grad_norm": 1.5300021171569824, "learning_rate": 4.399649085143354e-06, "loss": 0.2883, "step": 5216 },
    { "epoch": 0.91, "grad_norm": 1.8156017065048218, "learning_rate": 4.3890723628417605e-06, "loss": 0.2903, "step": 5248 },
    { "epoch": 0.92, "grad_norm": 1.8335083723068237, "learning_rate": 4.378416235656133e-06, "loss": 0.2964, "step": 5280 },
    { "epoch": 0.92, "grad_norm": 1.6856135129928589, "learning_rate": 4.3676811515039554e-06, "loss": 0.284, "step": 5312 },
    { "epoch": 0.93, "grad_norm": 1.8075724840164185, "learning_rate": 4.356867561621575e-06, "loss": 0.274, "step": 5344 },
    { "epoch": 0.94, "grad_norm": 2.1280436515808105, "learning_rate": 4.345975920545232e-06, "loss": 0.2781, "step": 5376 },
    { "epoch": 0.94, "grad_norm": 1.8782936334609985, "learning_rate": 4.335006686091956e-06, "loss": 0.2796, "step": 5408 },
    { "epoch": 0.95, "grad_norm": 1.802985429763794, "learning_rate": 4.323960319340321e-06, "loss": 0.2795, "step": 5440 },
    { "epoch": 0.95, "grad_norm": 1.6335434913635254, "learning_rate": 4.312837284611062e-06, "loss": 0.3002, "step": 5472 },
    { "epoch": 0.96, "grad_norm": 1.5709927082061768, "learning_rate": 4.301638049447563e-06, "loss": 0.2912, "step": 5504 },
    { "epoch": 0.96, "grad_norm": 1.890569806098938, "learning_rate": 4.290363084596199e-06, "loss": 0.2995, "step": 5536 },
    { "epoch": 0.97, "grad_norm": 1.405389666557312, "learning_rate": 4.279012863986554e-06, "loss": 0.2818, "step": 5568 },
    { "epoch": 0.98, "grad_norm": 1.4392948150634766, "learning_rate": 4.267587864711496e-06, "loss": 0.2944, "step": 5600 },
    { "epoch": 0.98, "grad_norm": 1.3795379400253296, "learning_rate": 4.256088567007123e-06, "loss": 0.2754, "step": 5632 },
    { "epoch": 0.99, "grad_norm": 1.4689635038375854, "learning_rate": 4.244515454232579e-06, "loss": 0.2935, "step": 5664 },
    { "epoch": 0.99, "grad_norm": 1.6686134338378906, "learning_rate": 4.232869012849739e-06, "loss": 0.2945, "step": 5696 },
    { "epoch": 1.0, "grad_norm": 1.3574343919754028, "learning_rate": 4.22114973240275e-06, "loss": 0.2899, "step": 5728 },
    { "epoch": 1.0, "grad_norm": 1.4560518264770508, "learning_rate": 4.20935810549747e-06, "loss": 0.2737, "step": 5760 },
    { "epoch": 1.01, "grad_norm": 1.5906906127929688, "learning_rate": 4.1974946277807485e-06, "loss": 0.2523, "step": 5792 },
    { "epoch": 1.01, "grad_norm": 2.1945157051086426, "learning_rate": 4.185559797919597e-06, "loss": 0.2449, "step": 5824 },
    { "epoch": 1.02, "grad_norm": 2.203098773956299, "learning_rate": 4.173554117580231e-06, "loss": 0.2434, "step": 5856 },
    { "epoch": 1.03, "grad_norm": 2.0016815662384033, "learning_rate": 4.16147809140698e-06, "loss": 0.2408, "step": 5888 },
    { "epoch": 1.03, "grad_norm": 1.8361921310424805, "learning_rate": 4.149332227001075e-06, "loss": 0.2466, "step": 5920 },
    { "epoch": 1.04, "grad_norm": 1.681320071220398, "learning_rate": 4.137117034899314e-06, "loss": 0.2437, "step": 5952 },
    { "epoch": 1.04, "grad_norm": 2.212421178817749, "learning_rate": 4.124833028552601e-06, "loss": 0.2488, "step": 5984 },
    { "epoch": 1.05, "grad_norm": 3.8892650604248047, "learning_rate": 4.112480724304362e-06, "loss": 0.2382, "step": 6016 },
    { "epoch": 1.05, "grad_norm": 1.904011607170105, "learning_rate": 4.100060641368848e-06, "loss": 0.2461, "step": 6048 },
    { "epoch": 1.06, "grad_norm": 1.6115171909332275, "learning_rate": 4.087573301809301e-06, "loss": 0.2475, "step": 6080 },
    { "epoch": 1.06, "grad_norm": 1.625502109527588, "learning_rate": 4.075019230516016e-06, "loss": 0.2535, "step": 6112 },
    { "epoch": 1.07, "grad_norm": 1.7666929960250854, "learning_rate": 4.062398955184277e-06, "loss": 0.2471, "step": 6144 },
    { "epoch": 1.08, "grad_norm": 1.4440840482711792, "learning_rate": 4.049713006292174e-06, "loss": 0.2346, "step": 6176 },
    { "epoch": 1.08, "grad_norm": 1.6042975187301636, "learning_rate": 4.036961917078305e-06, "loss": 0.2452, "step": 6208 },
    { "epoch": 1.09, "grad_norm": 1.5292000770568848, "learning_rate": 4.024146223519365e-06, "loss": 0.2303, "step": 6240 },
    { "epoch": 1.09, "grad_norm": 1.6863032579421997, "learning_rate": 4.011266464307615e-06, "loss": 0.2454, "step": 6272 },
    { "epoch": 1.1, "grad_norm": 1.8112516403198242, "learning_rate": 3.998323180828236e-06, "loss": 0.2335, "step": 6304 },
    { "epoch": 1.1, "grad_norm": 1.5118499994277954, "learning_rate": 3.985316917136579e-06, "loss": 0.2534, "step": 6336 },
    { "epoch": 1.11, "grad_norm": 1.620071291923523, "learning_rate": 3.972248219935289e-06, "loss": 0.2311, "step": 6368 },
    { "epoch": 1.11, "grad_norm": 1.4030137062072754, "learning_rate": 3.959117638551331e-06, "loss": 0.2411, "step": 6400 },
    { "epoch": 1.12, "grad_norm": 1.6774815320968628, "learning_rate": 3.945925724912896e-06, "loss": 0.2546, "step": 6432 },
    { "epoch": 1.13, "grad_norm": 2.215942144393921, "learning_rate": 3.932673033526203e-06, "loss": 0.2453, "step": 6464 },
    { "epoch": 1.13, "grad_norm": 1.5397557020187378, "learning_rate": 3.919360121452188e-06, "loss": 0.2455, "step": 6496 },
    { "epoch": 1.14, "grad_norm": 1.6343345642089844, "learning_rate": 3.905987548283097e-06, "loss": 0.2345, "step": 6528 },
    { "epoch": 1.14, "grad_norm": 1.7040832042694092, "learning_rate": 3.892555876118951e-06, "loss": 0.2456, "step": 6560 },
    { "epoch": 1.15, "grad_norm": 1.6422055959701538, "learning_rate": 3.879065669543931e-06, "loss": 0.2464, "step": 6592 },
    { "epoch": 1.15, "grad_norm": 1.47724187374115, "learning_rate": 3.865517495602642e-06, "loss": 0.2379, "step": 6624 },
    { "epoch": 1.16, "grad_norm": 1.5706995725631714, "learning_rate": 3.851911923776274e-06, "loss": 0.2542, "step": 6656 },
    { "epoch": 1.16, "grad_norm": 1.3481061458587646, "learning_rate": 3.83824952595867e-06, "loss": 0.2358, "step": 6688 },
    { "epoch": 1.17, "grad_norm": 1.7855818271636963, "learning_rate": 3.824530876432287e-06, "loss": 0.2506, "step": 6720 },
    { "epoch": 1.18, "grad_norm": 1.661627173423767, "learning_rate": 3.81075655184405e-06, "loss": 0.2477, "step": 6752 },
    { "epoch": 1.18, "grad_norm": 1.3250243663787842, "learning_rate": 3.796927131181124e-06, "loss": 0.236, "step": 6784 },
    { "epoch": 1.19, "grad_norm": 1.5946906805038452, "learning_rate": 3.7830431957465673e-06, "loss": 0.2405, "step": 6816 },
    { "epoch": 1.19, "grad_norm": 1.4719637632369995, "learning_rate": 3.7691053291349012e-06, "loss": 0.2444, "step": 6848 },
    { "epoch": 1.2, "grad_norm": 2.088393211364746, "learning_rate": 3.755114117207582e-06, "loss": 0.238, "step": 6880 },
    { "epoch": 1.2, "grad_norm": 1.853722333908081, "learning_rate": 3.7410701480683693e-06, "loss": 0.2496, "step": 6912 },
    { "epoch": 1.21, "grad_norm": 1.338397741317749, "learning_rate": 3.726974012038609e-06, "loss": 0.2482, "step": 6944 },
    { "epoch": 1.21, "grad_norm": 1.6698582172393799, "learning_rate": 3.7128263016324205e-06, "loss": 0.2501, "step": 6976 },
    { "epoch": 1.22, "grad_norm": 2.1817092895507812, "learning_rate": 3.698627611531791e-06, "loss": 0.2382, "step": 7008 },
    { "epoch": 1.23, "grad_norm": 1.6343122720718384, "learning_rate": 3.684378538561575e-06, "loss": 0.2447, "step": 7040 },
    { "epoch": 1.23, "grad_norm": 1.335663080215454, "learning_rate": 3.6700796816644115e-06, "loss": 0.245, "step": 7072 },
    { "epoch": 1.24, "grad_norm": 1.7304885387420654, "learning_rate": 3.655731641875549e-06, "loss": 0.2541, "step": 7104 },
    { "epoch": 1.24, "grad_norm": 1.6443067789077759, "learning_rate": 3.641335022297576e-06, "loss": 0.249, "step": 7136 },
    { "epoch": 1.25, "grad_norm": 1.5171420574188232, "learning_rate": 3.626890428075077e-06, "loss": 0.248, "step": 7168 },
    { "epoch": 1.25, "grad_norm": 1.6931904554367065, "learning_rate": 3.6123984663691925e-06, "loss": 0.2404, "step": 7200 },
    { "epoch": 1.26, "grad_norm": 1.7795478105545044, "learning_rate": 3.5978597463320964e-06, "loss": 0.2462, "step": 7232 },
    { "epoch": 1.26, "grad_norm": 1.7948668003082275, "learning_rate": 3.5832748790813945e-06, "loss": 0.2507, "step": 7264 },
    { "epoch": 1.27, "grad_norm": 1.6601665019989014, "learning_rate": 3.5686444776744346e-06, "loss": 0.2463, "step": 7296 },
    { "epoch": 1.28, "grad_norm": 1.675183892250061, "learning_rate": 3.5539691570825374e-06, "loss": 0.2514, "step": 7328 },
    { "epoch": 1.28, "grad_norm": 1.5422120094299316, "learning_rate": 3.5392495341651497e-06, "loss": 0.247, "step": 7360 },
    { "epoch": 1.29, "grad_norm": 1.522061824798584, "learning_rate": 3.5244862276439102e-06, "loss": 0.2455, "step": 7392 },
    { "epoch": 1.29, "grad_norm": 1.5597782135009766, "learning_rate": 3.5096798580766476e-06, "loss": 0.2514, "step": 7424 },
    { "epoch": 1.3, "grad_norm": 2.3420979976654053, "learning_rate": 3.494831047831293e-06, "loss": 0.2486, "step": 7456 },
    { "epoch": 1.3, "grad_norm": 1.397116780281067, "learning_rate": 3.479940421059721e-06, "loss": 0.2388, "step": 7488 },
    { "epoch": 1.31, "grad_norm": 1.9889482259750366, "learning_rate": 3.4650086036715123e-06, "loss": 0.2521, "step": 7520 },
    { "epoch": 1.31, "grad_norm": 1.3389945030212402, "learning_rate": 3.450036223307647e-06, "loss": 0.254, "step": 7552 },
    { "epoch": 1.32, "grad_norm": 1.550194263458252, "learning_rate": 3.435023909314119e-06, "loss": 0.2411, "step": 7584 },
    { "epoch": 1.33, "grad_norm": 1.39427649974823, "learning_rate": 3.4199722927154876e-06, "loss": 0.2421, "step": 7616 },
    { "epoch": 1.33, "grad_norm": 1.647886037826538, "learning_rate": 3.4048820061883475e-06, "loss": 0.2546, "step": 7648 },
    { "epoch": 1.34, "grad_norm": 1.4851469993591309, "learning_rate": 3.3897536840347384e-06, "loss": 0.2543, "step": 7680 },
    { "epoch": 1.34, "grad_norm": 1.5985283851623535, "learning_rate": 3.3745879621554793e-06, "loss": 0.2505, "step": 7712 },
    { "epoch": 1.35, "grad_norm": 1.467861294746399, "learning_rate": 3.3593854780234446e-06, "loss": 0.2408, "step": 7744 },
    { "epoch": 1.35, "grad_norm": 1.5086297988891602, "learning_rate": 3.3441468706567655e-06, "loss": 0.2492, "step": 7776 },
    { "epoch": 1.36, "grad_norm": 1.6549632549285889, "learning_rate": 3.328872780591968e-06, "loss": 0.2431, "step": 7808 },
    { "epoch": 1.37, "grad_norm": 1.413674235343933, "learning_rate": 3.313563849857052e-06, "loss": 0.2539, "step": 7840 },
    { "epoch": 1.37, "grad_norm": 1.547775387763977, "learning_rate": 3.298220721944504e-06, "loss": 0.2376, "step": 7872 },
    { "epoch": 1.38, "grad_norm": 1.834301471710205, "learning_rate": 3.282844041784245e-06, "loss": 0.2465, "step": 7904 },
    { "epoch": 1.38, "grad_norm": 1.5492215156555176, "learning_rate": 3.2674344557165268e-06, "loss": 0.2462, "step": 7936 },
    { "epoch": 1.39, "grad_norm": 1.691975474357605, "learning_rate": 3.2519926114647597e-06, "loss": 0.2423, "step": 7968 },
    { "epoch": 1.39, "grad_norm": 1.4883307218551636, "learning_rate": 3.2365191581082894e-06, "loss": 0.2431, "step": 8000 },
    { "epoch": 1.4, "grad_norm": 1.9474862813949585, "learning_rate": 3.221014746055112e-06, "loss": 0.2409, "step": 8032 },
    { "epoch": 1.4, "grad_norm": 1.7408241033554077, "learning_rate": 3.205480027014535e-06, "loss": 0.2529, "step": 8064 },
    { "epoch": 1.41, "grad_norm": 1.9496859312057495, "learning_rate": 3.1899156539697817e-06, "loss": 0.244, "step": 8096 },
    { "epoch": 1.42, "grad_norm": 1.5115084648132324, "learning_rate": 3.174322281150549e-06, "loss": 0.2355, "step": 8128 },
    { "epoch": 1.42, "grad_norm": 1.5742942094802856, "learning_rate": 3.1587005640055035e-06, "loss": 0.237, "step": 8160 },
    { "epoch": 1.43, "grad_norm": 1.6793855428695679, "learning_rate": 3.14305115917473e-06, "loss": 0.2357, "step": 8192 },
    { "epoch": 1.43, "grad_norm": 1.359031081199646, "learning_rate": 3.1273747244621333e-06, "loss": 0.2507, "step": 8224 },
    { "epoch": 1.44, "grad_norm": 1.3813912868499756, "learning_rate": 3.1116719188077867e-06, "loss": 0.2455, "step": 8256 },
    { "epoch": 1.44, "grad_norm": 1.2507054805755615, "learning_rate": 3.0959434022602326e-06, "loss": 0.2376, "step": 8288 },
    { "epoch": 1.45, "grad_norm": 1.6926429271697998, "learning_rate": 3.080189835948742e-06, "loss": 0.243, "step": 8320 },
    { "epoch": 1.45, "grad_norm": 1.7547937631607056, "learning_rate": 3.0644118820555217e-06, "loss": 0.2418, "step": 8352 },
    { "epoch": 1.46, "grad_norm": 1.5827124118804932, "learning_rate": 3.048610203787881e-06, "loss": 0.2422, "step": 8384 },
    { "epoch": 1.47, "grad_norm": 1.5525377988815308, "learning_rate": 3.0327854653503554e-06, "loss": 0.2382, "step": 8416 },
    { "epoch": 1.47, "grad_norm": 1.502051830291748, "learning_rate": 3.0169383319167866e-06, "loss": 0.2343, "step": 8448 },
    { "epoch": 1.48, "grad_norm": 1.5819698572158813, "learning_rate": 3.001069469602361e-06, "loss": 0.2307, "step": 8480 },
    { "epoch": 1.48, "grad_norm": 1.678463339805603, "learning_rate": 2.9851795454356164e-06, "loss": 0.2395, "step": 8512 },
    { "epoch": 1.49, "grad_norm": 1.5477441549301147, "learning_rate": 2.969269227330397e-06, "loss": 0.2458, "step": 8544 },
    { "epoch": 1.49, "grad_norm": 1.6442259550094604, "learning_rate": 2.953339184057783e-06, "loss": 0.2454, "step": 8576 },
    { "epoch": 1.5, "grad_norm": 1.751511573791504, "learning_rate": 2.9373900852179784e-06, "loss": 0.2286, "step": 8608 },
    { "epoch": 1.5, "grad_norm": 1.5601673126220703, "learning_rate": 2.9214226012121638e-06, "loss": 0.2389, "step": 8640 },
    { "epoch": 1.51, "grad_norm": 1.3844302892684937, "learning_rate": 2.905437403214319e-06, "loss": 0.2454, "step": 8672 },
    { "epoch": 1.52, "grad_norm": 1.2684638500213623, "learning_rate": 2.88943516314301e-06, "loss": 0.2376, "step": 8704 },
    { "epoch": 1.52, "grad_norm": 2.507758617401123, "learning_rate": 2.873416553633147e-06, "loss": 0.2269, "step": 8736 },
    { "epoch": 1.53, "grad_norm": 1.6755341291427612, "learning_rate": 2.857382248007708e-06, "loss": 0.2236, "step": 8768 },
    { "epoch": 1.53, "grad_norm": 1.9231692552566528, "learning_rate": 2.8413329202494396e-06, "loss": 0.2541, "step": 8800 },
    { "epoch": 1.54, "grad_norm": 1.528493046760559, "learning_rate": 2.825269244972525e-06, "loss": 0.2271, "step": 8832 },
    { "epoch": 1.54, "grad_norm": 1.3824098110198975, "learning_rate": 2.8091918973942276e-06, "loss": 0.2451, "step": 8864 },
    { "epoch": 1.55, "grad_norm": 1.4477118253707886, "learning_rate": 2.7931015533065116e-06, "loss": 0.2382, "step": 8896 },
    { "epoch": 1.55, "grad_norm": 1.9449779987335205, "learning_rate": 2.776998889047631e-06, "loss": 0.2543, "step": 8928 },
    { "epoch": 1.56, "grad_norm": 1.2107715606689453, "learning_rate": 2.760884581473706e-06, "loss": 0.2388, "step": 8960 },
    { "epoch": 1.57, "grad_norm": 1.4030863046646118, "learning_rate": 2.744759307930268e-06, "loss": 0.24, "step": 8992 },
    { "epoch": 1.57, "grad_norm": 1.8753631114959717, "learning_rate": 2.7286237462237907e-06, "loss": 0.2401, "step": 9024 },
    { "epoch": 1.58, "grad_norm": 1.5679469108581543, "learning_rate": 2.7124785745931974e-06, "loss": 0.2519, "step": 9056 },
    { "epoch": 1.58, "grad_norm": 1.4486414194107056, "learning_rate": 2.696324471681353e-06, "loss": 0.2457, "step": 9088 },
    { "epoch": 1.59, "grad_norm": 1.8142122030258179, "learning_rate": 2.6801621165065384e-06, "loss": 0.2385, "step": 9120 },
    { "epoch": 1.59, "grad_norm": 1.4930641651153564, "learning_rate": 2.6639921884339094e-06, "loss": 0.2411, "step": 9152 },
    { "epoch": 1.6, "grad_norm": 1.5598373413085938, "learning_rate": 2.647815367146937e-06, "loss": 0.2265, "step": 9184 },
    { "epoch": 1.6, "grad_norm": 1.7909144163131714, "learning_rate": 2.631632332618844e-06, "loss": 0.2502, "step": 9216 },
    { "epoch": 1.61, "grad_norm": 1.6813157796859741, "learning_rate": 2.6154437650840153e-06, "loss": 0.2368, "step": 9248 },
    { "epoch": 1.62, "grad_norm": 1.4486907720565796, "learning_rate": 2.599250345009411e-06, "loss": 0.2423, "step": 9280 },
    { "epoch": 1.62, "grad_norm": 1.7712582349777222, "learning_rate": 2.583052753065962e-06, "loss": 0.2312, "step": 9312 },
    { "epoch": 1.63, "grad_norm": 1.3970569372177124, "learning_rate": 2.5668516700999585e-06, "loss": 0.2415, "step": 9344 },
    { "epoch": 1.63, "grad_norm": 1.3571640253067017, "learning_rate": 2.5506477771044313e-06, "loss": 0.2254, "step": 9376 },
    { "epoch": 1.64, "grad_norm": 1.6543464660644531, "learning_rate": 2.5344417551905276e-06, "loss": 0.2358, "step": 9408 },
    { "epoch": 1.64, "grad_norm": 2.0646538734436035, "learning_rate": 2.518234285558882e-06, "loss": 0.2433, "step": 9440 },
    { "epoch": 1.65, "grad_norm": 1.5231086015701294, "learning_rate": 2.50202604947098e-06, "loss": 0.2289, "step": 9472 },
    { "epoch": 1.65, "grad_norm": 1.7834982872009277, "learning_rate": 2.485817728220526e-06, "loss": 0.2366, "step": 9504 },
    { "epoch": 1.66, "grad_norm": 1.4642645120620728, "learning_rate": 2.469610003104804e-06, "loss": 0.2354, "step": 9536 },
    { "epoch": 1.67, "grad_norm": 1.4511359930038452, "learning_rate": 2.453403555396038e-06, "loss": 0.2394, "step": 9568 },
    { "epoch": 1.67, "grad_norm": 1.9833004474639893, "learning_rate": 2.4371990663127613e-06, "loss": 0.2453, "step": 9600 },
    { "epoch": 1.68, "grad_norm": 2.215998888015747, "learning_rate": 2.420997216991178e-06, "loss": 0.2345, "step": 9632 },
    { "epoch": 1.68, "grad_norm": 1.5565507411956787, "learning_rate": 2.404798688456529e-06, "loss": 0.2465, "step": 9664 },
    { "epoch": 1.69, "grad_norm": 1.583949327468872, "learning_rate": 2.3886041615944753e-06, "loss": 0.2442, "step": 9696 },
    { "epoch": 1.69, "grad_norm": 1.6056400537490845, "learning_rate": 2.3724143171224684e-06, "loss": 0.2346, "step": 9728 },
    { "epoch": 1.7, "grad_norm": 1.5098297595977783, "learning_rate": 2.3562298355611444e-06, "loss": 0.2444, "step": 9760 },
    { "epoch": 1.71, "grad_norm": 1.4785221815109253, "learning_rate": 2.3400513972057117e-06, "loss": 0.245, "step": 9792 },
    { "epoch": 1.71, "grad_norm": 1.4235949516296387, "learning_rate": 2.323879682097365e-06, "loss": 0.2386, "step": 9824 },
    { "epoch": 1.72, "grad_norm": 1.4386334419250488, "learning_rate": 2.3077153699946912e-06, "loss": 0.2303, "step": 9856 },
    { "epoch": 1.72, "grad_norm": 1.6986160278320312, "learning_rate": 2.291559140345102e-06, "loss": 0.2329, "step": 9888 },
    { "epoch": 1.73, "grad_norm": 1.5963070392608643, "learning_rate": 2.2754116722562756e-06, "loss": 0.2352, "step": 9920 },
    { "epoch": 1.73, "grad_norm": 1.5542904138565063, "learning_rate": 2.2592736444676035e-06, "loss": 0.2378, "step": 9952 },
    { "epoch": 1.74, "grad_norm": 2.3971989154815674, "learning_rate": 2.243145735321669e-06, "loss": 0.2347, "step": 9984 },
    { "epoch": 1.74, "grad_norm": 1.5635007619857788, "learning_rate": 2.2270286227357306e-06, "loss": 0.2406, "step": 10016 },
    { "epoch": 1.75, "grad_norm": 1.545464038848877, "learning_rate": 2.210922984173223e-06, "loss": 0.2329, "step": 10048 },
    { "epoch": 1.76, "grad_norm": 1.3175172805786133, "learning_rate": 2.19482949661529e-06, "loss": 0.2448, "step": 10080 },
    { "epoch": 1.76, "grad_norm": 1.4598995447158813, "learning_rate": 2.1787488365323163e-06, "loss": 0.2294, "step": 10112 },
    { "epoch": 1.77, "grad_norm": 1.50557279586792, "learning_rate": 2.1626816798555035e-06, "loss": 0.2427, "step": 10144 },
    { "epoch": 1.77, "grad_norm": 1.596571683883667, "learning_rate": 2.14662870194845e-06, "loss": 0.2187, "step": 10176 },
    { "epoch": 1.78, "grad_norm": 1.5053282976150513, "learning_rate": 2.1305905775787713e-06, "loss": 0.2394, "step": 10208 },
    { "epoch": 1.78, "grad_norm": 1.6713193655014038, "learning_rate": 2.1145679808897297e-06, "loss": 0.2333, "step": 10240 },
    { "epoch": 1.79, "grad_norm": 1.394679069519043, "learning_rate": 2.098561585371898e-06, "loss": 0.236, "step": 10272 },
    { "epoch": 1.79, "grad_norm": 1.6781407594680786, "learning_rate": 2.082572063834857e-06, "loss": 0.2461, "step": 10304 },
    { "epoch": 1.8, "grad_norm": 1.6299934387207031, "learning_rate": 2.066600088378906e-06, "loss": 0.2464, "step": 10336 },
    { "epoch": 1.81, "grad_norm": 1.309910774230957, "learning_rate": 2.0506463303668182e-06, "loss": 0.2395, "step": 10368 },
    { "epoch": 1.81, "grad_norm": 1.5353403091430664, "learning_rate": 2.0347114603956184e-06, "loss": 0.2396, "step": 10400 },
    { "epoch": 1.82, "grad_norm": 1.5149140357971191, "learning_rate": 2.018796148268393e-06, "loss": 0.2267, "step": 10432 },
    { "epoch": 1.82, "grad_norm": 1.8665707111358643, "learning_rate": 2.002901062966141e-06, "loss": 0.2343, "step": 10464 },
    { "epoch": 1.83, "grad_norm": 1.7906211614608765, "learning_rate": 1.9870268726196493e-06, "loss": 0.2332, "step": 10496 },
    { "epoch": 1.83, "grad_norm": 1.6433167457580566, "learning_rate": 1.971174244481411e-06, "loss": 0.2387, "step": 10528 },
    { "epoch": 1.84, "grad_norm": 1.7116189002990723, "learning_rate": 1.9553438448975766e-06, "loss": 0.244, "step": 10560 },
    { "epoch": 1.84, "grad_norm": 1.4329649209976196, "learning_rate": 1.9395363392799486e-06, "loss": 0.2301, "step": 10592 },
    { "epoch": 1.85, "grad_norm": 1.4449670314788818, "learning_rate": 1.9237523920780077e-06, "loss": 0.2258, "step": 10624 },
    { "epoch": 1.86, "grad_norm": 1.8179880380630493, "learning_rate": 1.9079926667509833e-06, "loss": 0.2353, "step": 10656 },
    { "epoch": 1.86, "grad_norm": 1.6381878852844238, "learning_rate": 1.892257825739971e-06, "loss": 0.2263, "step": 10688 },
    { "epoch": 1.87, "grad_norm": 1.6362228393554688, "learning_rate": 1.8765485304400804e-06, "loss": 0.2312, "step": 10720 },
    { "epoch": 1.87, "grad_norm": 1.6400152444839478, "learning_rate": 1.8608654411726407e-06, "loss": 0.2313, "step": 10752 },
    { "epoch": 1.88, "grad_norm": 1.1943421363830566, "learning_rate": 1.8452092171574418e-06, "loss": 0.2239, "step": 10784 },
    { "epoch": 1.88, "grad_norm": 1.8176000118255615, "learning_rate": 1.8295805164850217e-06, "loss": 0.231, "step": 10816 },
    { "epoch": 1.89, "grad_norm": 1.5386502742767334, "learning_rate": 1.8139799960890132e-06, "loss": 0.233, "step": 10848 },
    { "epoch": 1.89, "grad_norm": 1.666212558746338, "learning_rate": 1.79840831171852e-06, "loss": 0.2387, "step": 10880 },
    { "epoch": 1.9, "grad_norm": 2.0908565521240234, "learning_rate": 1.7828661179105618e-06, "loss": 0.2301, "step": 10912 },
    { "epoch": 1.91, "grad_norm": 1.824118733406067, "learning_rate": 1.767354067962555e-06, "loss": 0.2211, "step": 10944 },
    { "epoch": 1.91, "grad_norm": 1.6803866624832153, "learning_rate": 1.7518728139048585e-06, "loss": 0.2283, "step": 10976 },
    { "epoch": 1.92, "grad_norm": 1.5006029605865479, "learning_rate": 1.7364230064733606e-06, "loss": 0.2376, "step": 11008 },
    { "epoch": 1.92, "grad_norm": 1.6305315494537354, "learning_rate": 1.7210052950821276e-06, "loss": 0.2274, "step": 11040 },
    { "epoch": 1.93, "grad_norm": 1.4924837350845337, "learning_rate": 1.7056203277961112e-06, "loss": 0.2301, "step": 11072 },
    { "epoch": 1.93, "grad_norm": 1.7117606401443481, "learning_rate": 1.6902687513039002e-06, "loss": 0.2391, "step": 11104 },
    { "epoch": 1.94, "grad_norm": 1.9313896894454956, "learning_rate": 1.6749512108905424e-06, "loss": 0.2268, "step": 11136 },
    { "epoch": 1.94, "grad_norm": 1.4171528816223145, "learning_rate": 1.6596683504104228e-06, "loss": 0.2327, "step": 11168 },
    { "epoch": 1.95, "grad_norm": 1.576985478401184, "learning_rate": 1.6444208122601933e-06, "loss": 0.2352, "step": 11200 },
    { "epoch": 1.96, "grad_norm": 1.453007698059082, "learning_rate": 1.6292092373517765e-06, "loss": 0.2315, "step": 11232 },
    { "epoch": 1.96, "grad_norm": 1.5981701612472534, "learning_rate": 1.6140342650854218e-06, "loss": 0.2304, "step": 11264 },
    { "epoch": 1.97, "grad_norm": 2.0827910900115967, "learning_rate": 1.5988965333228323e-06, "loss": 0.2338, "step": 11296 },
    { "epoch": 1.97, "grad_norm": 1.5926817655563354, "learning_rate": 1.5837966783603494e-06, "loss": 0.2498, "step": 11328 },
    { "epoch": 1.98, "grad_norm": 1.82351815700531, "learning_rate": 1.5687353349022114e-06, "loss": 0.2379, "step": 11360 },
    { "epoch": 1.98, "grad_norm": 1.5781277418136597, "learning_rate": 1.5537131360338697e-06, "loss": 0.2421, "step": 11392 },
    { "epoch": 1.99, "grad_norm": 1.4788808822631836, "learning_rate": 1.5387307131953793e-06, "loss": 0.2371, "step": 11424 },
    { "epoch": 1.99, "grad_norm": 1.8411000967025757, "learning_rate": 1.5237886961548615e-06, "loss": 0.2371, "step": 11456 },
    { "epoch": 2.0, "grad_norm": 1.7383184432983398, "learning_rate": 1.508887712982024e-06, "loss": 0.2307, "step": 11488 },
    { "epoch": 2.01, "grad_norm": 1.6967484951019287, "learning_rate": 1.4940283900217667e-06, "loss": 0.1677, "step": 11520 },
    { "epoch": 2.01, "grad_norm": 1.5547412633895874, "learning_rate": 1.4792113518678553e-06, "loss": 0.1562, "step": 11552 },
    { "epoch": 2.02, "grad_norm": 1.484421730041504, "learning_rate": 1.46443722133666e-06, "loss": 0.1644, "step": 11584 },
    { "epoch": 2.02, "grad_norm": 1.7713335752487183, "learning_rate": 1.4497066194409849e-06, "loss": 0.1703, "step": 11616 },
    { "epoch": 2.03, "grad_norm": 1.6081045866012573, "learning_rate": 1.435020165363956e-06, "loss": 0.1621, "step": 11648 },
    { "epoch": 2.03, "grad_norm": 1.6781724691390991, "learning_rate": 1.4203784764330033e-06, "loss": 0.1706, "step": 11680 },
    { "epoch": 2.04, "grad_norm": 1.5613493919372559, "learning_rate": 1.4057821680939049e-06, "loss": 0.1594, "step": 11712 },
    { "epoch": 2.04, "grad_norm": 1.6171530485153198,
| "learning_rate": 1.3912318538849207e-06, | |
| "loss": 0.1598, | |
| "step": 11744 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.4825495481491089, | |
| "learning_rate": 1.3767281454110037e-06, | |
| "loss": 0.1738, | |
| "step": 11776 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.7642580270767212, | |
| "learning_rate": 1.3622716523180898e-06, | |
| "loss": 0.1563, | |
| "step": 11808 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.3485716581344604, | |
| "learning_rate": 1.347862982267475e-06, | |
| "loss": 0.1571, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.6963993310928345, | |
| "learning_rate": 1.3335027409102663e-06, | |
| "loss": 0.1567, | |
| "step": 11872 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.5955970287322998, | |
| "learning_rate": 1.3191915318619357e-06, | |
| "loss": 0.1675, | |
| "step": 11904 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.1548969745635986, | |
| "learning_rate": 1.304929956676938e-06, | |
| "loss": 0.1666, | |
| "step": 11936 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.442049503326416, | |
| "learning_rate": 1.2907186148234246e-06, | |
| "loss": 0.1505, | |
| "step": 11968 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.654685139656067, | |
| "learning_rate": 1.276558103658057e-06, | |
| "loss": 0.1642, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.396730422973633, | |
| "learning_rate": 1.262449018400883e-06, | |
| "loss": 0.1641, | |
| "step": 12032 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.4008959531784058, | |
| "learning_rate": 1.248391952110327e-06, | |
| "loss": 0.1631, | |
| "step": 12064 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.4454419612884521, | |
| "learning_rate": 1.2343874956582586e-06, | |
| "loss": 0.1599, | |
| "step": 12096 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.054921865463257, | |
| "learning_rate": 1.2204362377051562e-06, | |
| "loss": 0.1639, | |
| "step": 12128 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.5172858238220215, | |
| "learning_rate": 1.2065387646753637e-06, | |
| "loss": 0.159, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.640461802482605, | |
| "learning_rate": 1.192695660732439e-06, | |
| "loss": 0.1607, | |
| "step": 12192 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.2705048322677612, | |
| "learning_rate": 1.1789075077546033e-06, | |
| "loss": 0.1691, | |
| "step": 12224 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.6255841255187988, | |
| "learning_rate": 1.1651748853102757e-06, | |
| "loss": 0.1572, | |
| "step": 12256 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.617698073387146, | |
| "learning_rate": 1.1514983706337212e-06, | |
| "loss": 0.1664, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.176952362060547, | |
| "learning_rate": 1.137878538600781e-06, | |
| "loss": 0.1642, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.0179526805877686, | |
| "learning_rate": 1.1243159617047051e-06, | |
| "loss": 0.1555, | |
| "step": 12352 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.6655995845794678, | |
| "learning_rate": 1.1108112100321002e-06, | |
| "loss": 0.1704, | |
| "step": 12384 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.541496992111206, | |
| "learning_rate": 1.0973648512389526e-06, | |
| "loss": 0.1611, | |
| "step": 12416 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.7085323333740234, | |
| "learning_rate": 1.0839774505267777e-06, | |
| "loss": 0.1663, | |
| "step": 12448 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.993090033531189, | |
| "learning_rate": 1.0706495706188584e-06, | |
| "loss": 0.1569, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.4851964712142944, | |
| "learning_rate": 1.0573817717365914e-06, | |
| "loss": 0.1651, | |
| "step": 12512 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.714492917060852, | |
| "learning_rate": 1.0441746115759407e-06, | |
| "loss": 0.1572, | |
| "step": 12544 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 1.7523784637451172, | |
| "learning_rate": 1.031028645283994e-06, | |
| "loss": 0.1736, | |
| "step": 12576 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.5587310791015625, | |
| "learning_rate": 1.0179444254356294e-06, | |
| "loss": 0.168, | |
| "step": 12608 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.5781267881393433, | |
| "learning_rate": 1.004922502010284e-06, | |
| "loss": 0.1606, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.6072574853897095, | |
| "learning_rate": 9.919634223688452e-07, | |
| "loss": 0.1614, | |
| "step": 12672 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.9626734256744385, | |
| "learning_rate": 9.790677312306346e-07, | |
| "loss": 0.1711, | |
| "step": 12704 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.87946355342865, | |
| "learning_rate": 9.662359706505113e-07, | |
| "loss": 0.1652, | |
| "step": 12736 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 2.207897186279297, | |
| "learning_rate": 9.534686799960977e-07, | |
| "loss": 0.1558, | |
| "step": 12768 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.8241429328918457, | |
| "learning_rate": 9.407663959250932e-07, | |
| "loss": 0.1572, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.6716265678405762, | |
| "learning_rate": 9.281296523627276e-07, | |
| "loss": 0.1558, | |
| "step": 12832 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.7584782838821411, | |
| "learning_rate": 9.15558980479313e-07, | |
| "loss": 0.1532, | |
| "step": 12864 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.6568748950958252, | |
| "learning_rate": 9.030549086679188e-07, | |
| "loss": 0.167, | |
| "step": 12896 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.6095608472824097, | |
| "learning_rate": 8.906179625221597e-07, | |
| "loss": 0.1585, | |
| "step": 12928 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.4875459671020508, | |
| "learning_rate": 8.782486648141042e-07, | |
| "loss": 0.1626, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.884362816810608, | |
| "learning_rate": 8.659475354723007e-07, | |
| "loss": 0.1596, | |
| "step": 12992 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.296562433242798, | |
| "learning_rate": 8.53715091559919e-07, | |
| "loss": 0.1652, | |
| "step": 13024 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.080160140991211, | |
| "learning_rate": 8.415518472530251e-07, | |
| "loss": 0.1535, | |
| "step": 13056 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.603214979171753, | |
| "learning_rate": 8.294583138189597e-07, | |
| "loss": 0.1699, | |
| "step": 13088 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.718169927597046, | |
| "learning_rate": 8.174349995948483e-07, | |
| "loss": 0.1688, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.613429307937622, | |
| "learning_rate": 8.054824099662429e-07, | |
| "loss": 0.1643, | |
| "step": 13152 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.3368054628372192, | |
| "learning_rate": 7.936010473458653e-07, | |
| "loss": 0.1556, | |
| "step": 13184 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.404004454612732, | |
| "learning_rate": 7.817914111524999e-07, | |
| "loss": 0.1588, | |
| "step": 13216 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.7263784408569336, | |
| "learning_rate": 7.700539977899962e-07, | |
| "loss": 0.1525, | |
| "step": 13248 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.7289042472839355, | |
| "learning_rate": 7.583893006264035e-07, | |
| "loss": 0.1633, | |
| "step": 13280 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.5951069593429565, | |
| "learning_rate": 7.467978099732331e-07, | |
| "loss": 0.1539, | |
| "step": 13312 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 2.1001362800598145, | |
| "learning_rate": 7.352800130648494e-07, | |
| "loss": 0.1627, | |
| "step": 13344 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.5156986713409424, | |
| "learning_rate": 7.238363940379881e-07, | |
| "loss": 0.156, | |
| "step": 13376 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.8548262119293213, | |
| "learning_rate": 7.124674339114071e-07, | |
| "loss": 0.1758, | |
| "step": 13408 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.5202828645706177, | |
| "learning_rate": 7.011736105656675e-07, | |
| "loss": 0.1562, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.8769176006317139, | |
| "learning_rate": 6.89955398723047e-07, | |
| "loss": 0.1598, | |
| "step": 13472 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.5129636526107788, | |
| "learning_rate": 6.788132699275813e-07, | |
| "loss": 0.1675, | |
| "step": 13504 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.8350058794021606, | |
| "learning_rate": 6.677476925252524e-07, | |
| "loss": 0.1658, | |
| "step": 13536 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.9995418787002563, | |
| "learning_rate": 6.567591316442911e-07, | |
| "loss": 0.1722, | |
| "step": 13568 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.6814707517623901, | |
| "learning_rate": 6.458480491756347e-07, | |
| "loss": 0.1615, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.6658669710159302, | |
| "learning_rate": 6.350149037535075e-07, | |
| "loss": 0.1544, | |
| "step": 13632 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.5126328468322754, | |
| "learning_rate": 6.242601507361442e-07, | |
| "loss": 0.1577, | |
| "step": 13664 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.7326850891113281, | |
| "learning_rate": 6.135842421866486e-07, | |
| "loss": 0.1538, | |
| "step": 13696 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 1.641743779182434, | |
| "learning_rate": 6.029876268539925e-07, | |
| "loss": 0.1612, | |
| "step": 13728 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.7537841796875, | |
| "learning_rate": 5.924707501541527e-07, | |
| "loss": 0.1607, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.8395192623138428, | |
| "learning_rate": 5.820340541513886e-07, | |
| "loss": 0.1621, | |
| "step": 13792 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.7613660097122192, | |
| "learning_rate": 5.7167797753966e-07, | |
| "loss": 0.1586, | |
| "step": 13824 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.8884879350662231, | |
| "learning_rate": 5.61402955624189e-07, | |
| "loss": 0.1639, | |
| "step": 13856 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 1.3437737226486206, | |
| "learning_rate": 5.512094203031576e-07, | |
| "loss": 0.1537, | |
| "step": 13888 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 1.8541299104690552, | |
| "learning_rate": 5.410978000495621e-07, | |
| "loss": 0.1609, | |
| "step": 13920 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.6411800384521484, | |
| "learning_rate": 5.310685198931926e-07, | |
| "loss": 0.1597, | |
| "step": 13952 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.4559264183044434, | |
| "learning_rate": 5.211220014027746e-07, | |
| "loss": 0.1563, | |
| "step": 13984 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.6533401012420654, | |
| "learning_rate": 5.112586626682467e-07, | |
| "loss": 0.1614, | |
| "step": 14016 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.519086241722107, | |
| "learning_rate": 5.014789182831858e-07, | |
| "loss": 0.1649, | |
| "step": 14048 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.6548802852630615, | |
| "learning_rate": 4.917831793273814e-07, | |
| "loss": 0.1579, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.4974662065505981, | |
| "learning_rate": 4.821718533495553e-07, | |
| "loss": 0.1581, | |
| "step": 14112 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.5565108060836792, | |
| "learning_rate": 4.7264534435023186e-07, | |
| "loss": 0.1664, | |
| "step": 14144 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.8209164142608643, | |
| "learning_rate": 4.6320405276475524e-07, | |
| "loss": 0.1649, | |
| "step": 14176 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.8029505014419556, | |
| "learning_rate": 4.5384837544645956e-07, | |
| "loss": 0.1691, | |
| "step": 14208 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.0014963150024414, | |
| "learning_rate": 4.445787056499826e-07, | |
| "loss": 0.1588, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.701003909111023, | |
| "learning_rate": 4.3539543301474446e-07, | |
| "loss": 0.1653, | |
| "step": 14272 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.8511948585510254, | |
| "learning_rate": 4.262989435485615e-07, | |
| "loss": 0.1554, | |
| "step": 14304 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.2707128524780273, | |
| "learning_rate": 4.172896196114234e-07, | |
| "loss": 0.1555, | |
| "step": 14336 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.784833550453186, | |
| "learning_rate": 4.083678398994237e-07, | |
| "loss": 0.1541, | |
| "step": 14368 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 2.091581106185913, | |
| "learning_rate": 3.995339794288383e-07, | |
| "loss": 0.157, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.475711703300476, | |
| "learning_rate": 3.9078840952036455e-07, | |
| "loss": 0.1652, | |
| "step": 14432 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.0263583660125732, | |
| "learning_rate": 3.8213149778351164e-07, | |
| "loss": 0.163, | |
| "step": 14464 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.084484100341797, | |
| "learning_rate": 3.73563608101149e-07, | |
| "loss": 0.1589, | |
| "step": 14496 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 1.5741300582885742, | |
| "learning_rate": 3.65085100614212e-07, | |
| "loss": 0.1547, | |
| "step": 14528 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.7066149711608887, | |
| "learning_rate": 3.566963317065622e-07, | |
| "loss": 0.1628, | |
| "step": 14560 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.4174087047576904, | |
| "learning_rate": 3.483976539900083e-07, | |
| "loss": 0.1589, | |
| "step": 14592 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.6838244199752808, | |
| "learning_rate": 3.401894162894828e-07, | |
| "loss": 0.1492, | |
| "step": 14624 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.6532583236694336, | |
| "learning_rate": 3.320719636283837e-07, | |
| "loss": 0.1589, | |
| "step": 14656 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.447808861732483, | |
| "learning_rate": 3.240456372140674e-07, | |
| "loss": 0.1633, | |
| "step": 14688 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.8401908874511719, | |
| "learning_rate": 3.161107744235067e-07, | |
| "loss": 0.1607, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 1.8145461082458496, | |
| "learning_rate": 3.082677087891148e-07, | |
| "loss": 0.1528, | |
| "step": 14752 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 2.0109219551086426, | |
| "learning_rate": 3.0051676998471807e-07, | |
| "loss": 0.1611, | |
| "step": 14784 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 1.9445598125457764, | |
| "learning_rate": 2.9285828381170443e-07, | |
| "loss": 0.1545, | |
| "step": 14816 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.956513524055481, | |
| "learning_rate": 2.852925721853264e-07, | |
| "loss": 0.1515, | |
| "step": 14848 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.5658009052276611, | |
| "learning_rate": 2.7781995312117005e-07, | |
| "loss": 0.1475, | |
| "step": 14880 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.5122560262680054, | |
| "learning_rate": 2.704407407217871e-07, | |
| "loss": 0.1485, | |
| "step": 14912 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.7659705877304077, | |
| "learning_rate": 2.631552451634931e-07, | |
| "loss": 0.1516, | |
| "step": 14944 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 1.709915041923523, | |
| "learning_rate": 2.5596377268332916e-07, | |
| "loss": 0.1578, | |
| "step": 14976 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 1.7999032735824585, | |
| "learning_rate": 2.488666255661873e-07, | |
| "loss": 0.1593, | |
| "step": 15008 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.7251070737838745, | |
| "learning_rate": 2.418641021321097e-07, | |
| "loss": 0.1457, | |
| "step": 15040 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.96919584274292, | |
| "learning_rate": 2.3495649672374442e-07, | |
| "loss": 0.158, | |
| "step": 15072 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.7707350254058838, | |
| "learning_rate": 2.281440996939724e-07, | |
| "loss": 0.1522, | |
| "step": 15104 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.8055440187454224, | |
| "learning_rate": 2.2142719739370876e-07, | |
| "loss": 0.1644, | |
| "step": 15136 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.5484322309494019, | |
| "learning_rate": 2.1480607215985938e-07, | |
| "loss": 0.153, | |
| "step": 15168 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.3656980991363525, | |
| "learning_rate": 2.0828100230345815e-07, | |
| "loss": 0.1666, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.2295897006988525, | |
| "learning_rate": 2.018522620979657e-07, | |
| "loss": 0.1598, | |
| "step": 15232 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.66213858127594, | |
| "learning_rate": 1.95520121767743e-07, | |
| "loss": 0.1458, | |
| "step": 15264 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.7052069902420044, | |
| "learning_rate": 1.8928484747669007e-07, | |
| "loss": 0.1582, | |
| "step": 15296 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 2.455540895462036, | |
| "learning_rate": 1.8314670131706015e-07, | |
| "loss": 0.1626, | |
| "step": 15328 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 1.7177009582519531, | |
| "learning_rate": 1.771059412984427e-07, | |
| "loss": 0.1489, | |
| "step": 15360 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 1.9176840782165527, | |
| "learning_rate": 1.7116282133691624e-07, | |
| "loss": 0.155, | |
| "step": 15392 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.6064426898956299, | |
| "learning_rate": 1.6531759124437967e-07, | |
| "loss": 0.1646, | |
| "step": 15424 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.764961838722229, | |
| "learning_rate": 1.5957049671804753e-07, | |
| "loss": 0.1553, | |
| "step": 15456 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.645261287689209, | |
| "learning_rate": 1.5392177933012258e-07, | |
| "loss": 0.1661, | |
| "step": 15488 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.78240168094635, | |
| "learning_rate": 1.4837167651764573e-07, | |
| "loss": 0.1724, | |
| "step": 15520 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 2.0585949420928955, | |
| "learning_rate": 1.4292042157251023e-07, | |
| "loss": 0.1539, | |
| "step": 15552 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.6157474517822266, | |
| "learning_rate": 1.3756824363165943e-07, | |
| "loss": 0.1574, | |
| "step": 15584 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 1.6896705627441406, | |
| "learning_rate": 1.3231536766745517e-07, | |
| "loss": 0.1629, | |
| "step": 15616 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 2.022658109664917, | |
| "learning_rate": 1.2716201447821763e-07, | |
| "loss": 0.1529, | |
| "step": 15648 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.5828081369400024, | |
| "learning_rate": 1.2210840067894857e-07, | |
| "loss": 0.1633, | |
| "step": 15680 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.9208072423934937, | |
| "learning_rate": 1.1715473869222393e-07, | |
| "loss": 0.159, | |
| "step": 15712 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.5745195150375366, | |
| "learning_rate": 1.123012367392659e-07, | |
| "loss": 0.1641, | |
| "step": 15744 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.5185731649398804, | |
| "learning_rate": 1.0754809883118916e-07, | |
| "loss": 0.1553, | |
| "step": 15776 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.097116470336914, | |
| "learning_rate": 1.0289552476042768e-07, | |
| "loss": 0.1502, | |
| "step": 15808 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.7868729829788208, | |
| "learning_rate": 9.83437100923354e-08, | |
| "loss": 0.1576, | |
| "step": 15840 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.7661504745483398, | |
| "learning_rate": 9.389284615696464e-08, | |
| "loss": 0.1658, | |
| "step": 15872 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 1.692753553390503, | |
| "learning_rate": 8.954312004102711e-08, | |
| "loss": 0.1586, | |
| "step": 15904 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 1.4457049369812012, | |
| "learning_rate": 8.529471458002648e-08, | |
| "loss": 0.1659, | |
| "step": 15936 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.9470157623291016, | |
| "learning_rate": 8.114780835057456e-08, | |
| "loss": 0.151, | |
| "step": 15968 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 1.5848197937011719, | |
| "learning_rate": 7.710257566288681e-08, | |
| "loss": 0.1512, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.0228586196899414, | |
| "learning_rate": 7.315918655345117e-08, | |
| "loss": 0.1622, | |
| "step": 16032 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.6379644870758057, | |
| "learning_rate": 6.931780677788546e-08, | |
| "loss": 0.1609, | |
| "step": 16064 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.4579676389694214, | |
| "learning_rate": 6.557859780396663e-08, | |
| "loss": 0.1578, | |
| "step": 16096 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.8530055284500122, | |
| "learning_rate": 6.194171680484556e-08, | |
| "loss": 0.1566, | |
| "step": 16128 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.0496585369110107, | |
| "learning_rate": 5.8407316652438764e-08, | |
| "loss": 0.1613, | |
| "step": 16160 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 1.8340975046157837, | |
| "learning_rate": 5.4975545911005176e-08, | |
| "loss": 0.1598, | |
| "step": 16192 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 2.1443088054656982, | |
| "learning_rate": 5.164654883089926e-08, | |
| "loss": 0.1536, | |
| "step": 16224 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.8571709394454956, | |
| "learning_rate": 4.842046534250716e-08, | |
| "loss": 0.1568, | |
| "step": 16256 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.0833983421325684, | |
| "learning_rate": 4.529743105036844e-08, | |
| "loss": 0.1702, | |
| "step": 16288 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.618950605392456, | |
| "learning_rate": 4.227757722747139e-08, | |
| "loss": 0.1471, | |
| "step": 16320 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.9389454126358032, | |
| "learning_rate": 3.9361030809738074e-08, | |
| "loss": 0.1563, | |
| "step": 16352 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.731814980506897, | |
| "learning_rate": 3.6547914390688835e-08, | |
| "loss": 0.159, | |
| "step": 16384 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 2.0916991233825684, | |
| "learning_rate": 3.3838346216287785e-08, | |
| "loss": 0.1531, | |
| "step": 16416 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.8886840343475342, | |
| "learning_rate": 3.1232440179972954e-08, | |
| "loss": 0.1494, | |
| "step": 16448 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 1.890721321105957, | |
| "learning_rate": 2.8730305817869786e-08, | |
| "loss": 0.1544, | |
| "step": 16480 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.5375641584396362, | |
| "learning_rate": 2.6332048304185677e-08, | |
| "loss": 0.1577, | |
| "step": 16512 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.7129803895950317, | |
| "learning_rate": 2.40377684467899e-08, | |
| "loss": 0.161, | |
| "step": 16544 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 1.6450425386428833, | |
| "learning_rate": 2.1847562682976166e-08, | |
| "loss": 0.1541, | |
| "step": 16576 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 1.6297754049301147, | |
| "learning_rate": 1.976152307540863e-08, | |
| "loss": 0.1613, | |
| "step": 16608 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.9899592399597168, | |
| "learning_rate": 1.777973730825222e-08, | |
| "loss": 0.1499, | |
| "step": 16640 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.3760799169540405, | |
| "learning_rate": 1.590228868348781e-08, | |
| "loss": 0.1545, | |
| "step": 16672 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 2.1230156421661377, | |
| "learning_rate": 1.4129256117409451e-08, | |
| "loss": 0.1669, | |
| "step": 16704 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 1.7051235437393188, | |
| "learning_rate": 1.2460714137307594e-08, | |
| "loss": 0.1577, | |
| "step": 16736 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.5741279125213623, | |
| "learning_rate": 1.08967328783377e-08, | |
| "loss": 0.1626, | |
| "step": 16768 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.71488356590271, | |
| "learning_rate": 9.437378080569825e-09, | |
| "loss": 0.1576, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.9407358169555664, | |
| "learning_rate": 8.082711086226936e-09, | |
| "loss": 0.1656, | |
| "step": 16832 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 2.023576259613037, | |
| "learning_rate": 6.832788837106974e-09, | |
| "loss": 0.2152, | |
| "step": 16864 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.999402403831482, | |
| "learning_rate": 5.687663872187555e-09, | |
| "loss": 0.1555, | |
| "step": 16896 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.8923932313919067, | |
| "learning_rate": 4.647384325418835e-09, | |
| "loss": 0.1519, | |
| "step": 16928 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.6705876588821411, | |
| "learning_rate": 3.7119939237001412e-09, | |
| "loss": 0.1514, | |
| "step": 16960 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.5758410692214966, | |
| "learning_rate": 2.8815319850414303e-09, | |
| "loss": 0.1642, | |
| "step": 16992 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.5247572660446167, | |
| "learning_rate": 2.1560334169112852e-09, | |
| "loss": 0.1592, | |
| "step": 17024 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.464015007019043, | |
| "learning_rate": 1.5355287147694742e-09, | |
| "loss": 0.1534, | |
| "step": 17056 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.6785483360290527, | |
| "learning_rate": 1.020043960784367e-09, | |
| "loss": 0.162, | |
| "step": 17088 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.7320752143859863, | |
| "learning_rate": 6.096008227371441e-10, | |
| "loss": 0.1565, | |
| "step": 17120 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.4247487783432007, | |
| "learning_rate": 3.042165531116914e-10, | |
| "loss": 0.1616, | |
| "step": 17152 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.5723894834518433, | |
| "learning_rate": 1.0390398836851446e-10, | |
| "loss": 0.1567, | |
| "step": 17184 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.970982551574707, | |
| "learning_rate": 8.671548404615149e-12, | |
| "loss": 0.1556, | |
| "step": 17216 | |
| } | |
| ], | |
| "logging_steps": 32, | |
| "max_steps": 17229, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 5743, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
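
The file above is a standard Hugging Face `transformers` `trainer_state.json`: per the metadata, training ran for 3 epochs (17,229 optimizer steps, logged every 32 steps, checkpoints saved every 5,743 steps) and stopped normally (`should_training_stop: true`). To inspect the log programmatically, a minimal sketch follows; it uses only the standard library, and the checkpoint path is a hypothetical placeholder you would replace with the actual checkpoint directory.

```python
import json

# Hypothetical path: point this at the checkpoint directory the
# Trainer produced (it saves state every `save_steps` = 5743 steps).
STATE_FILE = "checkpoint-17229/trainer_state.json"

with open(STATE_FILE) as f:
    state = json.load(f)

# Each log_history entry is one logging event (every 32 steps here).
# Training-loss entries carry "loss", "learning_rate", "grad_norm", "step".
train_logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

print(f"{len(train_logs)} logged points over {state['num_train_epochs']} epochs")
print(f"final loss {losses[-1]:.4f} at step {steps[-1]} (lr {lrs[-1]:.3e})")
```

Plotting `losses` against `steps` makes the epoch boundaries visible: the loss drops from about 0.23 at step 11488 (epoch 2.0) to about 0.17 at step 11520 (epoch 2.01), a typical artifact of the model re-seeing data at the start of a new epoch. The learning rate at the final logged step (8.67e-12 at step 17216) is the tail end of the decay schedule winding down to zero.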