{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 454.58181818181816, "eval_steps": 10, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09696969696969697, "grad_norm": 0.1921696960926056, "learning_rate": 4.999e-05, "loss": 2.8765, "step": 1 }, { "epoch": 0.19393939393939394, "grad_norm": 0.19308380782604218, "learning_rate": 4.9980000000000006e-05, "loss": 2.8673, "step": 2 }, { "epoch": 0.2909090909090909, "grad_norm": 0.19083072245121002, "learning_rate": 4.997e-05, "loss": 2.8733, "step": 3 }, { "epoch": 0.3878787878787879, "grad_norm": 0.19004836678504944, "learning_rate": 4.996e-05, "loss": 2.892, "step": 4 }, { "epoch": 0.48484848484848486, "grad_norm": 0.18908675014972687, "learning_rate": 4.995e-05, "loss": 2.88, "step": 5 }, { "epoch": 0.5818181818181818, "grad_norm": 0.19075514376163483, "learning_rate": 4.9940000000000006e-05, "loss": 2.8799, "step": 6 }, { "epoch": 0.6787878787878788, "grad_norm": 0.19584377110004425, "learning_rate": 4.9930000000000005e-05, "loss": 2.8752, "step": 7 }, { "epoch": 0.7757575757575758, "grad_norm": 0.19357410073280334, "learning_rate": 4.992e-05, "loss": 2.8753, "step": 8 }, { "epoch": 0.8727272727272727, "grad_norm": 0.193776935338974, "learning_rate": 4.991e-05, "loss": 2.8731, "step": 9 }, { "epoch": 0.9696969696969697, "grad_norm": 0.19457189738750458, "learning_rate": 4.99e-05, "loss": 2.8538, "step": 10 }, { "epoch": 0.9696969696969697, "eval_loss": 2.8589916229248047, "eval_runtime": 2.0031, "eval_samples_per_second": 27.458, "eval_steps_per_second": 3.495, "step": 10 }, { "epoch": 1.0, "grad_norm": 0.191086545586586, "learning_rate": 4.9890000000000005e-05, "loss": 2.8476, "step": 11 }, { "epoch": 1.096969696969697, "grad_norm": 0.19667789340019226, "learning_rate": 4.9880000000000004e-05, "loss": 2.8696, "step": 12 }, { "epoch": 1.1939393939393939, "grad_norm": 0.2008863240480423, "learning_rate": 4.987e-05, "loss": 2.8536, "step": 13 }, { "epoch": 1.290909090909091, "grad_norm": 0.19403468072414398, "learning_rate": 4.986e-05, "loss": 2.8426, "step": 14 }, { "epoch": 1.387878787878788, "grad_norm": 0.2005588710308075, "learning_rate": 4.9850000000000006e-05, "loss": 2.8555, "step": 15 }, { "epoch": 1.4848484848484849, "grad_norm": 0.20173998177051544, "learning_rate": 4.9840000000000004e-05, "loss": 2.852, "step": 16 }, { "epoch": 1.5818181818181818, "grad_norm": 0.19781255722045898, "learning_rate": 4.983e-05, "loss": 2.8481, "step": 17 }, { "epoch": 1.6787878787878787, "grad_norm": 0.19945280253887177, "learning_rate": 4.982e-05, "loss": 2.8432, "step": 18 }, { "epoch": 1.7757575757575759, "grad_norm": 0.2086201161146164, "learning_rate": 4.981e-05, "loss": 2.8603, "step": 19 }, { "epoch": 1.8727272727272726, "grad_norm": 0.20707911252975464, "learning_rate": 4.9800000000000004e-05, "loss": 2.8205, "step": 20 }, { "epoch": 1.8727272727272726, "eval_loss": 2.8336637020111084, "eval_runtime": 2.097, "eval_samples_per_second": 26.228, "eval_steps_per_second": 3.338, "step": 20 }, { "epoch": 1.9696969696969697, "grad_norm": 0.20440326631069183, "learning_rate": 4.979e-05, "loss": 2.8464, "step": 21 }, { "epoch": 2.0, "grad_norm": 0.21748438477516174, "learning_rate": 4.978e-05, "loss": 2.8079, "step": 22 }, { "epoch": 2.096969696969697, "grad_norm": 0.21118205785751343, "learning_rate": 4.977e-05, "loss": 2.8453, "step": 23 }, { "epoch": 2.193939393939394, "grad_norm": 0.2079344540834427, "learning_rate": 4.976e-05, "loss": 2.8295, "step": 24 }, { "epoch": 2.290909090909091, "grad_norm": 0.21478907763957977, "learning_rate": 4.975e-05, "loss": 2.8252, "step": 25 }, { "epoch": 2.3878787878787877, "grad_norm": 0.21348024904727936, "learning_rate": 4.974e-05, "loss": 2.8124, "step": 26 }, { "epoch": 2.484848484848485, "grad_norm": 0.22032339870929718, "learning_rate": 4.973000000000001e-05, "loss": 2.8064, "step": 27 }, { "epoch": 2.581818181818182, "grad_norm": 0.21923890709877014, "learning_rate": 4.972e-05, "loss": 2.8035, "step": 28 }, { "epoch": 2.6787878787878787, "grad_norm": 0.22203513979911804, "learning_rate": 4.9710000000000003e-05, "loss": 2.8098, "step": 29 }, { "epoch": 2.775757575757576, "grad_norm": 0.22282050549983978, "learning_rate": 4.97e-05, "loss": 2.8159, "step": 30 }, { "epoch": 2.775757575757576, "eval_loss": 2.804253339767456, "eval_runtime": 2.1435, "eval_samples_per_second": 25.659, "eval_steps_per_second": 3.266, "step": 30 }, { "epoch": 2.8727272727272726, "grad_norm": 0.22722773253917694, "learning_rate": 4.969e-05, "loss": 2.8072, "step": 31 }, { "epoch": 2.9696969696969697, "grad_norm": 0.2269333004951477, "learning_rate": 4.9680000000000005e-05, "loss": 2.8199, "step": 32 }, { "epoch": 3.0, "grad_norm": 0.23603735864162445, "learning_rate": 4.967e-05, "loss": 2.8051, "step": 33 }, { "epoch": 3.096969696969697, "grad_norm": 0.226282000541687, "learning_rate": 4.966e-05, "loss": 2.8064, "step": 34 }, { "epoch": 3.193939393939394, "grad_norm": 0.23394860327243805, "learning_rate": 4.965e-05, "loss": 2.8158, "step": 35 }, { "epoch": 3.290909090909091, "grad_norm": 0.22885440289974213, "learning_rate": 4.9640000000000006e-05, "loss": 2.7904, "step": 36 }, { "epoch": 3.3878787878787877, "grad_norm": 0.23805929720401764, "learning_rate": 4.9630000000000004e-05, "loss": 2.7877, "step": 37 }, { "epoch": 3.484848484848485, "grad_norm": 0.23020139336585999, "learning_rate": 4.962e-05, "loss": 2.787, "step": 38 }, { "epoch": 3.581818181818182, "grad_norm": 0.2334854155778885, "learning_rate": 4.961e-05, "loss": 2.7586, "step": 39 }, { "epoch": 3.6787878787878787, "grad_norm": 0.23337151110172272, "learning_rate": 4.96e-05, "loss": 2.7849, "step": 40 }, { "epoch": 3.6787878787878787, "eval_loss": 2.7703146934509277, "eval_runtime": 2.1916, "eval_samples_per_second": 25.096, "eval_steps_per_second": 3.194, "step": 40 }, { "epoch": 3.775757575757576, "grad_norm": 0.23387908935546875, "learning_rate": 4.9590000000000005e-05, "loss": 2.7605, "step": 41 }, { "epoch": 3.8727272727272726, "grad_norm": 0.23480385541915894, "learning_rate": 4.958e-05, "loss": 2.7698, "step": 42 }, { "epoch": 3.9696969696969697, "grad_norm": 0.23578350245952606, "learning_rate": 4.957e-05, "loss": 2.7539, "step": 43 }, { "epoch": 4.0, "grad_norm": 0.24619624018669128, "learning_rate": 4.956e-05, "loss": 2.769, "step": 44 }, { "epoch": 4.096969696969697, "grad_norm": 0.23596659302711487, "learning_rate": 4.9550000000000005e-05, "loss": 2.7477, "step": 45 }, { "epoch": 4.193939393939394, "grad_norm": 0.23750188946723938, "learning_rate": 4.9540000000000003e-05, "loss": 2.7477, "step": 46 }, { "epoch": 4.290909090909091, "grad_norm": 0.24240151047706604, "learning_rate": 4.953e-05, "loss": 2.7516, "step": 47 }, { "epoch": 4.387878787878788, "grad_norm": 0.23533636331558228, "learning_rate": 4.952e-05, "loss": 2.7578, "step": 48 }, { "epoch": 4.484848484848484, "grad_norm": 0.2335629016160965, "learning_rate": 4.951e-05, "loss": 2.7476, "step": 49 }, { "epoch": 4.581818181818182, "grad_norm": 0.2371414303779602, "learning_rate": 4.9500000000000004e-05, "loss": 2.7223, "step": 50 }, { "epoch": 4.581818181818182, "eval_loss": 2.7336995601654053, "eval_runtime": 2.1993, "eval_samples_per_second": 25.008, "eval_steps_per_second": 3.183, "step": 50 }, { "epoch": 4.678787878787879, "grad_norm": 0.2299228310585022, "learning_rate": 4.949e-05, "loss": 2.756, "step": 51 }, { "epoch": 4.775757575757575, "grad_norm": 0.23399955034255981, "learning_rate": 4.948000000000001e-05, "loss": 2.7664, "step": 52 }, { "epoch": 4.872727272727273, "grad_norm": 0.23219211399555206, "learning_rate": 4.947e-05, "loss": 2.7277, "step": 53 }, { "epoch": 4.96969696969697, "grad_norm": 0.23143132030963898, "learning_rate": 4.946e-05, "loss": 2.6945, "step": 54 }, { "epoch": 5.0, "grad_norm": 0.24724680185317993, "learning_rate": 4.945e-05, "loss": 2.7077, "step": 55 }, { "epoch": 5.096969696969697, "grad_norm": 0.23197874426841736, "learning_rate": 4.944e-05, "loss": 2.7047, "step": 56 }, { "epoch": 5.193939393939394, "grad_norm": 0.23464535176753998, "learning_rate": 4.9430000000000006e-05, "loss": 2.7158, "step": 57 }, { "epoch": 5.290909090909091, "grad_norm": 0.23159249126911163, "learning_rate": 4.942e-05, "loss": 2.7076, "step": 58 }, { "epoch": 5.387878787878788, "grad_norm": 0.23467811942100525, "learning_rate": 4.941e-05, "loss": 2.7272, "step": 59 }, { "epoch": 5.484848484848484, "grad_norm": 0.23297619819641113, "learning_rate": 4.94e-05, "loss": 2.6964, "step": 60 }, { "epoch": 5.484848484848484, "eval_loss": 2.694591760635376, "eval_runtime": 2.2142, "eval_samples_per_second": 24.84, "eval_steps_per_second": 3.161, "step": 60 }, { "epoch": 5.581818181818182, "grad_norm": 0.23715738952159882, "learning_rate": 4.939e-05, "loss": 2.6981, "step": 61 }, { "epoch": 5.678787878787879, "grad_norm": 0.23313502967357635, "learning_rate": 4.9380000000000005e-05, "loss": 2.6859, "step": 62 }, { "epoch": 5.775757575757575, "grad_norm": 0.23508723080158234, "learning_rate": 4.937e-05, "loss": 2.6977, "step": 63 }, { "epoch": 5.872727272727273, "grad_norm": 0.2312891036272049, "learning_rate": 4.936e-05, "loss": 2.6921, "step": 64 }, { "epoch": 5.96969696969697, "grad_norm": 0.232807457447052, "learning_rate": 4.935e-05, "loss": 2.6618, "step": 65 }, { "epoch": 6.0, "grad_norm": 0.23697008192539215, "learning_rate": 4.9340000000000005e-05, "loss": 2.6795, "step": 66 }, { "epoch": 6.096969696969697, "grad_norm": 0.23080474138259888, "learning_rate": 4.9330000000000004e-05, "loss": 2.6681, "step": 67 }, { "epoch": 6.193939393939394, "grad_norm": 0.2305583357810974, "learning_rate": 4.932e-05, "loss": 2.6906, "step": 68 }, { "epoch": 6.290909090909091, "grad_norm": 0.2397168129682541, "learning_rate": 4.931e-05, "loss": 2.6724, "step": 69 }, { "epoch": 6.387878787878788, "grad_norm": 0.23490968346595764, "learning_rate": 4.93e-05, "loss": 2.6539, "step": 70 }, { "epoch": 6.387878787878788, "eval_loss": 2.654001474380493, "eval_runtime": 2.1567, "eval_samples_per_second": 25.502, "eval_steps_per_second": 3.246, "step": 70 }, { "epoch": 6.484848484848484, "grad_norm": 0.23402239382266998, "learning_rate": 4.9290000000000004e-05, "loss": 2.6689, "step": 71 }, { "epoch": 6.581818181818182, "grad_norm": 0.23379398882389069, "learning_rate": 4.928e-05, "loss": 2.6511, "step": 72 }, { "epoch": 6.678787878787879, "grad_norm": 0.23643533885478973, "learning_rate": 4.927000000000001e-05, "loss": 2.6132, "step": 73 }, { "epoch": 6.775757575757575, "grad_norm": 0.23277723789215088, "learning_rate": 4.926e-05, "loss": 2.6498, "step": 74 }, { "epoch": 6.872727272727273, "grad_norm": 0.23537500202655792, "learning_rate": 4.9250000000000004e-05, "loss": 2.631, "step": 75 }, { "epoch": 6.96969696969697, "grad_norm": 0.22895337641239166, "learning_rate": 4.924e-05, "loss": 2.6461, "step": 76 }, { "epoch": 7.0, "grad_norm": 0.24864304065704346, "learning_rate": 4.923e-05, "loss": 2.6042, "step": 77 }, { "epoch": 7.096969696969697, "grad_norm": 0.23936185240745544, "learning_rate": 4.9220000000000006e-05, "loss": 2.6338, "step": 78 }, { "epoch": 7.193939393939394, "grad_norm": 0.2343270629644394, "learning_rate": 4.921e-05, "loss": 2.5853, "step": 79 }, { "epoch": 7.290909090909091, "grad_norm": 0.2339862585067749, "learning_rate": 4.92e-05, "loss": 2.6097, "step": 80 }, { "epoch": 7.290909090909091, "eval_loss": 2.6114232540130615, "eval_runtime": 2.154, "eval_samples_per_second": 25.534, "eval_steps_per_second": 3.25, "step": 80 }, { "epoch": 7.387878787878788, "grad_norm": 0.2315565049648285, "learning_rate": 4.919e-05, "loss": 2.6263, "step": 81 }, { "epoch": 7.484848484848484, "grad_norm": 0.2334234118461609, "learning_rate": 4.918000000000001e-05, "loss": 2.6175, "step": 82 }, { "epoch": 7.581818181818182, "grad_norm": 0.23795263469219208, "learning_rate": 4.9170000000000005e-05, "loss": 2.6318, "step": 83 }, { "epoch": 7.678787878787879, "grad_norm": 0.24352413415908813, "learning_rate": 4.9160000000000004e-05, "loss": 2.6086, "step": 84 }, { "epoch": 7.775757575757575, "grad_norm": 0.23900386691093445, "learning_rate": 4.915e-05, "loss": 2.5974, "step": 85 }, { "epoch": 7.872727272727273, "grad_norm": 0.24548974633216858, "learning_rate": 4.914e-05, "loss": 2.5798, "step": 86 }, { "epoch": 7.96969696969697, "grad_norm": 0.2442977875471115, "learning_rate": 4.9130000000000006e-05, "loss": 2.5816, "step": 87 }, { "epoch": 8.0, "grad_norm": 0.24905452132225037, "learning_rate": 4.9120000000000004e-05, "loss": 2.55, "step": 88 }, { "epoch": 8.096969696969698, "grad_norm": 0.23655293881893158, "learning_rate": 4.911e-05, "loss": 2.563, "step": 89 }, { "epoch": 8.193939393939393, "grad_norm": 0.24191634356975555, "learning_rate": 4.91e-05, "loss": 2.585, "step": 90 }, { "epoch": 8.193939393939393, "eval_loss": 2.5664913654327393, "eval_runtime": 2.1414, "eval_samples_per_second": 25.684, "eval_steps_per_second": 3.269, "step": 90 }, { "epoch": 8.290909090909091, "grad_norm": 0.2470662146806717, "learning_rate": 4.9090000000000006e-05, "loss": 2.5824, "step": 91 }, { "epoch": 8.387878787878789, "grad_norm": 0.24474897980690002, "learning_rate": 4.9080000000000004e-05, "loss": 2.5554, "step": 92 }, { "epoch": 8.484848484848484, "grad_norm": 0.2424492985010147, "learning_rate": 4.907e-05, "loss": 2.552, "step": 93 }, { "epoch": 8.581818181818182, "grad_norm": 0.24509666860103607, "learning_rate": 4.906e-05, "loss": 2.5625, "step": 94 }, { "epoch": 8.67878787878788, "grad_norm": 0.2482563704252243, "learning_rate": 4.905e-05, "loss": 2.5262, "step": 95 }, { "epoch": 8.775757575757575, "grad_norm": 0.2569093704223633, "learning_rate": 4.9040000000000005e-05, "loss": 2.5401, "step": 96 }, { "epoch": 8.872727272727273, "grad_norm": 0.2476951777935028, "learning_rate": 4.903e-05, "loss": 2.5603, "step": 97 }, { "epoch": 8.969696969696969, "grad_norm": 0.250521719455719, "learning_rate": 4.902e-05, "loss": 2.5349, "step": 98 }, { "epoch": 9.0, "grad_norm": 0.2586294114589691, "learning_rate": 4.901e-05, "loss": 2.5142, "step": 99 }, { "epoch": 9.096969696969698, "grad_norm": 0.2525769770145416, "learning_rate": 4.9e-05, "loss": 2.5204, "step": 100 }, { "epoch": 9.096969696969698, "eval_loss": 2.5186212062835693, "eval_runtime": 2.1549, "eval_samples_per_second": 25.523, "eval_steps_per_second": 3.248, "step": 100 }, { "epoch": 9.193939393939393, "grad_norm": 0.26334884762763977, "learning_rate": 4.8990000000000004e-05, "loss": 2.5295, "step": 101 }, { "epoch": 9.290909090909091, "grad_norm": 0.25765350461006165, "learning_rate": 4.898e-05, "loss": 2.5146, "step": 102 }, { "epoch": 9.387878787878789, "grad_norm": 0.2612263560295105, "learning_rate": 4.897000000000001e-05, "loss": 2.512, "step": 103 }, { "epoch": 9.484848484848484, "grad_norm": 0.2547607123851776, "learning_rate": 4.896e-05, "loss": 2.5108, "step": 104 }, { "epoch": 9.581818181818182, "grad_norm": 0.2574792504310608, "learning_rate": 4.8950000000000004e-05, "loss": 2.4964, "step": 105 }, { "epoch": 9.67878787878788, "grad_norm": 0.25030583143234253, "learning_rate": 4.894e-05, "loss": 2.488, "step": 106 }, { "epoch": 9.775757575757575, "grad_norm": 0.2604466676712036, "learning_rate": 4.893e-05, "loss": 2.4789, "step": 107 }, { "epoch": 9.872727272727273, "grad_norm": 0.25026217103004456, "learning_rate": 4.8920000000000006e-05, "loss": 2.5173, "step": 108 }, { "epoch": 9.969696969696969, "grad_norm": 0.26027733087539673, "learning_rate": 4.891e-05, "loss": 2.4682, "step": 109 }, { "epoch": 10.0, "grad_norm": 0.2752644717693329, "learning_rate": 4.89e-05, "loss": 2.4277, "step": 110 }, { "epoch": 10.0, "eval_loss": 2.4674863815307617, "eval_runtime": 2.1562, "eval_samples_per_second": 25.508, "eval_steps_per_second": 3.247, "step": 110 }, { "epoch": 10.096969696969698, "grad_norm": 0.2690877318382263, "learning_rate": 4.889e-05, "loss": 2.5009, "step": 111 }, { "epoch": 10.193939393939393, "grad_norm": 0.26510998606681824, "learning_rate": 4.8880000000000006e-05, "loss": 2.4636, "step": 112 }, { "epoch": 10.290909090909091, "grad_norm": 0.26340264081954956, "learning_rate": 4.8870000000000005e-05, "loss": 2.4617, "step": 113 }, { "epoch": 10.387878787878789, "grad_norm": 0.26563766598701477, "learning_rate": 4.886e-05, "loss": 2.4468, "step": 114 }, { "epoch": 10.484848484848484, "grad_norm": 0.2594900131225586, "learning_rate": 4.885e-05, "loss": 2.4572, "step": 115 }, { "epoch": 10.581818181818182, "grad_norm": 0.26996803283691406, "learning_rate": 4.884e-05, "loss": 2.4254, "step": 116 }, { "epoch": 10.67878787878788, "grad_norm": 0.26760613918304443, "learning_rate": 4.8830000000000005e-05, "loss": 2.4523, "step": 117 }, { "epoch": 10.775757575757575, "grad_norm": 0.27425920963287354, "learning_rate": 4.8820000000000004e-05, "loss": 2.4343, "step": 118 }, { "epoch": 10.872727272727273, "grad_norm": 0.27467113733291626, "learning_rate": 4.881e-05, "loss": 2.4095, "step": 119 }, { "epoch": 10.969696969696969, "grad_norm": 0.2764286398887634, "learning_rate": 4.88e-05, "loss": 2.3839, "step": 120 }, { "epoch": 10.969696969696969, "eval_loss": 2.412783145904541, "eval_runtime": 2.1539, "eval_samples_per_second": 25.535, "eval_steps_per_second": 3.25, "step": 120 }, { "epoch": 11.0, "grad_norm": 0.27834805846214294, "learning_rate": 4.8790000000000006e-05, "loss": 2.4266, "step": 121 }, { "epoch": 11.096969696969698, "grad_norm": 0.2820325791835785, "learning_rate": 4.8780000000000004e-05, "loss": 2.4043, "step": 122 }, { "epoch": 11.193939393939393, "grad_norm": 0.28312867879867554, "learning_rate": 4.877e-05, "loss": 2.4158, "step": 123 }, { "epoch": 11.290909090909091, "grad_norm": 0.28191468119621277, "learning_rate": 4.876e-05, "loss": 2.3882, "step": 124 }, { "epoch": 11.387878787878789, "grad_norm": 0.2714318037033081, "learning_rate": 4.875e-05, "loss": 2.3846, "step": 125 }, { "epoch": 11.484848484848484, "grad_norm": 0.27267444133758545, "learning_rate": 4.8740000000000004e-05, "loss": 2.4015, "step": 126 }, { "epoch": 11.581818181818182, "grad_norm": 0.28706297278404236, "learning_rate": 4.873e-05, "loss": 2.3745, "step": 127 }, { "epoch": 11.67878787878788, "grad_norm": 0.2960398495197296, "learning_rate": 4.872000000000001e-05, "loss": 2.3496, "step": 128 }, { "epoch": 11.775757575757575, "grad_norm": 0.2834423780441284, "learning_rate": 4.871e-05, "loss": 2.3626, "step": 129 }, { "epoch": 11.872727272727273, "grad_norm": 0.28782230615615845, "learning_rate": 4.87e-05, "loss": 2.3588, "step": 130 }, { "epoch": 11.872727272727273, "eval_loss": 2.3537094593048096, "eval_runtime": 2.154, "eval_samples_per_second": 25.534, "eval_steps_per_second": 3.25, "step": 130 }, { "epoch": 11.969696969696969, "grad_norm": 0.2884594202041626, "learning_rate": 4.869e-05, "loss": 2.3706, "step": 131 }, { "epoch": 12.0, "grad_norm": 0.302971750497818, "learning_rate": 4.868e-05, "loss": 2.3574, "step": 132 }, { "epoch": 12.096969696969698, "grad_norm": 0.2976221740245819, "learning_rate": 4.867000000000001e-05, "loss": 2.342, "step": 133 }, { "epoch": 12.193939393939393, "grad_norm": 0.2907634973526001, "learning_rate": 4.866e-05, "loss": 2.3474, "step": 134 }, { "epoch": 12.290909090909091, "grad_norm": 0.3033353388309479, "learning_rate": 4.8650000000000003e-05, "loss": 2.3137, "step": 135 }, { "epoch": 12.387878787878789, "grad_norm": 0.3008555769920349, "learning_rate": 4.864e-05, "loss": 2.3045, "step": 136 }, { "epoch": 12.484848484848484, "grad_norm": 0.3030095398426056, "learning_rate": 4.863e-05, "loss": 2.325, "step": 137 }, { "epoch": 12.581818181818182, "grad_norm": 0.302406907081604, "learning_rate": 4.8620000000000005e-05, "loss": 2.3364, "step": 138 }, { "epoch": 12.67878787878788, "grad_norm": 0.3113637864589691, "learning_rate": 4.861e-05, "loss": 2.2955, "step": 139 }, { "epoch": 12.775757575757575, "grad_norm": 0.32233142852783203, "learning_rate": 4.86e-05, "loss": 2.2928, "step": 140 }, { "epoch": 12.775757575757575, "eval_loss": 2.288254499435425, "eval_runtime": 2.1646, "eval_samples_per_second": 25.409, "eval_steps_per_second": 3.234, "step": 140 }, { "epoch": 12.872727272727273, "grad_norm": 0.3165872395038605, "learning_rate": 4.859e-05, "loss": 2.2981, "step": 141 }, { "epoch": 12.969696969696969, "grad_norm": 0.31966012716293335, "learning_rate": 4.8580000000000006e-05, "loss": 2.2695, "step": 142 }, { "epoch": 13.0, "grad_norm": 0.3387593924999237, "learning_rate": 4.8570000000000004e-05, "loss": 2.2426, "step": 143 }, { "epoch": 13.096969696969698, "grad_norm": 0.3338097929954529, "learning_rate": 4.856e-05, "loss": 2.2611, "step": 144 }, { "epoch": 13.193939393939393, "grad_norm": 0.3384833037853241, "learning_rate": 4.855e-05, "loss": 2.2649, "step": 145 }, { "epoch": 13.290909090909091, "grad_norm": 0.32794836163520813, "learning_rate": 4.854e-05, "loss": 2.2717, "step": 146 }, { "epoch": 13.387878787878789, "grad_norm": 0.33190327882766724, "learning_rate": 4.8530000000000005e-05, "loss": 2.2387, "step": 147 }, { "epoch": 13.484848484848484, "grad_norm": 0.3333049714565277, "learning_rate": 4.852e-05, "loss": 2.2707, "step": 148 }, { "epoch": 13.581818181818182, "grad_norm": 0.3635169267654419, "learning_rate": 4.851e-05, "loss": 2.1971, "step": 149 }, { "epoch": 13.67878787878788, "grad_norm": 0.3630567193031311, "learning_rate": 4.85e-05, "loss": 2.2174, "step": 150 }, { "epoch": 13.67878787878788, "eval_loss": 2.2112526893615723, "eval_runtime": 2.1483, "eval_samples_per_second": 25.602, "eval_steps_per_second": 3.258, "step": 150 }, { "epoch": 13.775757575757575, "grad_norm": 0.37875112891197205, "learning_rate": 4.8490000000000005e-05, "loss": 2.1765, "step": 151 }, { "epoch": 13.872727272727273, "grad_norm": 0.3608453869819641, "learning_rate": 4.8480000000000003e-05, "loss": 2.2112, "step": 152 }, { "epoch": 13.969696969696969, "grad_norm": 0.3816240131855011, "learning_rate": 4.847e-05, "loss": 2.1969, "step": 153 }, { "epoch": 14.0, "grad_norm": 0.39412960410118103, "learning_rate": 4.846e-05, "loss": 2.1902, "step": 154 }, { "epoch": 14.096969696969698, "grad_norm": 0.3680770993232727, "learning_rate": 4.845e-05, "loss": 2.1605, "step": 155 }, { "epoch": 14.193939393939393, "grad_norm": 0.348919540643692, "learning_rate": 4.8440000000000004e-05, "loss": 2.1788, "step": 156 }, { "epoch": 14.290909090909091, "grad_norm": 0.3487190306186676, "learning_rate": 4.843e-05, "loss": 2.1753, "step": 157 }, { "epoch": 14.387878787878789, "grad_norm": 0.3936024606227875, "learning_rate": 4.842000000000001e-05, "loss": 2.1322, "step": 158 }, { "epoch": 14.484848484848484, "grad_norm": 0.45259958505630493, "learning_rate": 4.841e-05, "loss": 2.1273, "step": 159 }, { "epoch": 14.581818181818182, "grad_norm": 0.4806135296821594, "learning_rate": 4.8400000000000004e-05, "loss": 2.1488, "step": 160 }, { "epoch": 14.581818181818182, "eval_loss": 2.125792980194092, "eval_runtime": 2.1611, "eval_samples_per_second": 25.45, "eval_steps_per_second": 3.239, "step": 160 }, { "epoch": 14.67878787878788, "grad_norm": 0.4466473162174225, "learning_rate": 4.839e-05, "loss": 2.0891, "step": 161 }, { "epoch": 14.775757575757575, "grad_norm": 0.38130316138267517, "learning_rate": 4.838e-05, "loss": 2.1258, "step": 162 }, { "epoch": 14.872727272727273, "grad_norm": 0.36415424942970276, "learning_rate": 4.8370000000000006e-05, "loss": 2.1096, "step": 163 }, { "epoch": 14.969696969696969, "grad_norm": 0.3677339255809784, "learning_rate": 4.836e-05, "loss": 2.1179, "step": 164 }, { "epoch": 15.0, "grad_norm": 0.4070127308368683, "learning_rate": 4.835e-05, "loss": 2.0842, "step": 165 }, { "epoch": 15.096969696969698, "grad_norm": 0.41008260846138, "learning_rate": 4.834e-05, "loss": 2.1046, "step": 166 }, { "epoch": 15.193939393939393, "grad_norm": 0.4382018744945526, "learning_rate": 4.833e-05, "loss": 2.0424, "step": 167 }, { "epoch": 15.290909090909091, "grad_norm": 0.4082714319229126, "learning_rate": 4.8320000000000005e-05, "loss": 2.0994, "step": 168 }, { "epoch": 15.387878787878789, "grad_norm": 0.3896704912185669, "learning_rate": 4.8309999999999997e-05, "loss": 2.0825, "step": 169 }, { "epoch": 15.484848484848484, "grad_norm": 0.38054773211479187, "learning_rate": 4.83e-05, "loss": 2.0364, "step": 170 }, { "epoch": 15.484848484848484, "eval_loss": 2.0379815101623535, "eval_runtime": 2.1425, "eval_samples_per_second": 25.67, "eval_steps_per_second": 3.267, "step": 170 }, { "epoch": 15.581818181818182, "grad_norm": 0.3640633225440979, "learning_rate": 4.829e-05, "loss": 2.0312, "step": 171 }, { "epoch": 15.67878787878788, "grad_norm": 0.35890400409698486, "learning_rate": 4.8280000000000005e-05, "loss": 2.0358, "step": 172 }, { "epoch": 15.775757575757575, "grad_norm": 0.41220933198928833, "learning_rate": 4.8270000000000004e-05, "loss": 2.0001, "step": 173 }, { "epoch": 15.872727272727273, "grad_norm": 0.4162037968635559, "learning_rate": 4.826e-05, "loss": 2.0126, "step": 174 }, { "epoch": 15.969696969696969, "grad_norm": 0.391644150018692, "learning_rate": 4.825e-05, "loss": 1.9568, "step": 175 }, { "epoch": 16.0, "grad_norm": 0.3766094744205475, "learning_rate": 4.824e-05, "loss": 2.0131, "step": 176 }, { "epoch": 16.096969696969698, "grad_norm": 0.3483814597129822, "learning_rate": 4.8230000000000004e-05, "loss": 1.9951, "step": 177 }, { "epoch": 16.193939393939395, "grad_norm": 0.39426183700561523, "learning_rate": 4.822e-05, "loss": 1.9671, "step": 178 }, { "epoch": 16.29090909090909, "grad_norm": 0.4302196800708771, "learning_rate": 4.821e-05, "loss": 1.933, "step": 179 }, { "epoch": 16.387878787878787, "grad_norm": 0.35731634497642517, "learning_rate": 4.82e-05, "loss": 2.016, "step": 180 }, { "epoch": 16.387878787878787, "eval_loss": 1.9507142305374146, "eval_runtime": 2.1627, "eval_samples_per_second": 25.432, "eval_steps_per_second": 3.237, "step": 180 }, { "epoch": 16.484848484848484, "grad_norm": 0.40772905945777893, "learning_rate": 4.8190000000000004e-05, "loss": 1.9514, "step": 181 }, { "epoch": 16.581818181818182, "grad_norm": 0.37485066056251526, "learning_rate": 4.818e-05, "loss": 1.9189, "step": 182 }, { "epoch": 16.67878787878788, "grad_norm": 0.36008599400520325, "learning_rate": 4.817e-05, "loss": 1.933, "step": 183 }, { "epoch": 16.775757575757577, "grad_norm": 0.376492977142334, "learning_rate": 4.816e-05, "loss": 1.9066, "step": 184 }, { "epoch": 16.87272727272727, "grad_norm": 0.41807034611701965, "learning_rate": 4.815e-05, "loss": 1.9273, "step": 185 }, { "epoch": 16.96969696969697, "grad_norm": 0.3658643066883087, "learning_rate": 4.814e-05, "loss": 1.8925, "step": 186 }, { "epoch": 17.0, "grad_norm": 0.36926499009132385, "learning_rate": 4.813e-05, "loss": 1.8963, "step": 187 }, { "epoch": 17.096969696969698, "grad_norm": 0.34902480244636536, "learning_rate": 4.812000000000001e-05, "loss": 1.9145, "step": 188 }, { "epoch": 17.193939393939395, "grad_norm": 0.41031065583229065, "learning_rate": 4.8110000000000005e-05, "loss": 1.8789, "step": 189 }, { "epoch": 17.29090909090909, "grad_norm": 0.3900549113750458, "learning_rate": 4.8100000000000004e-05, "loss": 1.8519, "step": 190 }, { "epoch": 17.29090909090909, "eval_loss": 1.863952398300171, "eval_runtime": 2.1371, "eval_samples_per_second": 25.736, "eval_steps_per_second": 3.275, "step": 190 }, { "epoch": 17.387878787878787, "grad_norm": 0.38358181715011597, "learning_rate": 4.809e-05, "loss": 1.8534, "step": 191 }, { "epoch": 17.484848484848484, "grad_norm": 0.3578030467033386, "learning_rate": 4.808e-05, "loss": 1.8474, "step": 192 }, { "epoch": 17.581818181818182, "grad_norm": 0.39505457878112793, "learning_rate": 4.8070000000000006e-05, "loss": 1.8345, "step": 193 }, { "epoch": 17.67878787878788, "grad_norm": 0.41133230924606323, "learning_rate": 4.8060000000000004e-05, "loss": 1.8128, "step": 194 }, { "epoch": 17.775757575757577, "grad_norm": 0.3547358214855194, "learning_rate": 4.805e-05, "loss": 1.8453, "step": 195 }, { "epoch": 17.87272727272727, "grad_norm": 0.3567870855331421, "learning_rate": 4.804e-05, "loss": 1.8447, "step": 196 }, { "epoch": 17.96969696969697, "grad_norm": 0.4133602976799011, "learning_rate": 4.8030000000000006e-05, "loss": 1.7853, "step": 197 }, { "epoch": 18.0, "grad_norm": 0.35908421874046326, "learning_rate": 4.8020000000000004e-05, "loss": 1.8628, "step": 198 }, { "epoch": 18.096969696969698, "grad_norm": 0.35534003376960754, "learning_rate": 4.801e-05, "loss": 1.8196, "step": 199 }, { "epoch": 18.193939393939395, "grad_norm": 0.34279873967170715, "learning_rate": 4.8e-05, "loss": 1.7631, "step": 200 }, { "epoch": 18.193939393939395, "eval_loss": 1.7806957960128784, "eval_runtime": 2.1552, "eval_samples_per_second": 25.52, "eval_steps_per_second": 3.248, "step": 200 }, { "epoch": 18.29090909090909, "grad_norm": 0.32960304617881775, "learning_rate": 4.799e-05, "loss": 1.8037, "step": 201 }, { "epoch": 18.387878787878787, "grad_norm": 0.3589719831943512, "learning_rate": 4.7980000000000005e-05, "loss": 1.7687, "step": 202 }, { "epoch": 18.484848484848484, "grad_norm": 0.34671100974082947, "learning_rate": 4.797e-05, "loss": 1.759, "step": 203 }, { "epoch": 18.581818181818182, "grad_norm": 0.3515738546848297, "learning_rate": 4.796e-05, "loss": 1.6964, "step": 204 }, { "epoch": 18.67878787878788, "grad_norm": 0.3296724259853363, "learning_rate": 4.795e-05, "loss": 1.7632, "step": 205 }, { "epoch": 18.775757575757577, "grad_norm": 0.3740043640136719, "learning_rate": 4.794e-05, "loss": 1.7256, "step": 206 }, { "epoch": 18.87272727272727, "grad_norm": 0.3721851110458374, "learning_rate": 4.7930000000000004e-05, "loss": 1.76, "step": 207 }, { "epoch": 18.96969696969697, "grad_norm": 0.3484649360179901, "learning_rate": 4.792e-05, "loss": 1.7084, "step": 208 }, { "epoch": 19.0, "grad_norm": 0.3665394186973572, "learning_rate": 4.791000000000001e-05, "loss": 1.7428, "step": 209 }, { "epoch": 19.096969696969698, "grad_norm": 0.344052791595459, "learning_rate": 4.79e-05, "loss": 1.7106, "step": 210 }, { "epoch": 19.096969696969698, "eval_loss": 1.7033495903015137, "eval_runtime": 2.1657, "eval_samples_per_second": 25.396, "eval_steps_per_second": 3.232, "step": 210 }, { "epoch": 19.193939393939395, "grad_norm": 0.31811317801475525, "learning_rate": 4.7890000000000004e-05, "loss": 1.7242, "step": 211 }, { "epoch": 19.29090909090909, "grad_norm": 0.3290516436100006, "learning_rate": 4.788e-05, "loss": 1.6554, "step": 212 }, { "epoch": 19.387878787878787, "grad_norm": 0.3231726288795471, "learning_rate": 4.787e-05, "loss": 1.6793, "step": 213 }, { "epoch": 19.484848484848484, "grad_norm": 0.3180878460407257, "learning_rate": 4.7860000000000006e-05, "loss": 1.6603, "step": 214 }, { "epoch": 19.581818181818182, "grad_norm": 0.3123716711997986, "learning_rate": 4.785e-05, "loss": 1.693, "step": 215 }, { "epoch": 19.67878787878788, "grad_norm": 0.3156301975250244, "learning_rate": 4.784e-05, "loss": 1.6663, "step": 216 }, { "epoch": 19.775757575757577, "grad_norm": 0.3017955720424652, "learning_rate": 4.783e-05, "loss": 1.6787, "step": 217 }, { "epoch": 19.87272727272727, "grad_norm": 0.3095964193344116, "learning_rate": 4.7820000000000006e-05, "loss": 1.6693, "step": 218 }, { "epoch": 19.96969696969697, "grad_norm": 0.29991716146469116, "learning_rate": 4.7810000000000005e-05, "loss": 1.629, "step": 219 }, { "epoch": 20.0, "grad_norm": 0.32141411304473877, "learning_rate": 4.78e-05, "loss": 1.6009, "step": 220 }, { "epoch": 20.0, "eval_loss": 1.635511040687561, "eval_runtime": 2.1483, "eval_samples_per_second": 25.602, "eval_steps_per_second": 3.258, "step": 220 }, { "epoch": 20.096969696969698, "grad_norm": 0.30841124057769775, "learning_rate": 4.779e-05, "loss": 1.6056, "step": 221 }, { "epoch": 20.193939393939395, "grad_norm": 0.2942273020744324, "learning_rate": 4.778e-05, "loss": 1.6703, "step": 222 }, { "epoch": 20.29090909090909, "grad_norm": 0.2849220931529999, "learning_rate": 4.7770000000000005e-05, "loss": 1.6272, "step": 223 }, { "epoch": 20.387878787878787, "grad_norm": 0.2896654009819031, "learning_rate": 4.7760000000000004e-05, "loss": 1.5896, "step": 224 }, { "epoch": 20.484848484848484, "grad_norm": 0.29047784209251404, "learning_rate": 4.775e-05, "loss": 1.6155, "step": 225 }, { "epoch": 20.581818181818182, "grad_norm": 0.2764190137386322, "learning_rate": 4.774e-05, "loss": 1.6085, "step": 226 }, { "epoch": 20.67878787878788, "grad_norm": 0.28020909428596497, "learning_rate": 4.7730000000000005e-05, "loss": 1.5519, "step": 227 }, { "epoch": 20.775757575757577, "grad_norm": 0.27882450819015503, "learning_rate": 4.7720000000000004e-05, "loss": 1.615, "step": 228 }, { "epoch": 20.87272727272727, "grad_norm": 0.2711680233478546, "learning_rate": 4.771e-05, "loss": 1.6046, "step": 229 }, { "epoch": 20.96969696969697, "grad_norm": 0.28352415561676025, "learning_rate": 4.77e-05, "loss": 1.5786, "step": 230 }, { "epoch": 20.96969696969697, "eval_loss": 1.5802624225616455, "eval_runtime": 2.1482, "eval_samples_per_second": 25.603, "eval_steps_per_second": 3.259, "step": 230 }, { "epoch": 21.0, "grad_norm": 0.3100228011608124, "learning_rate": 4.769e-05, "loss": 1.5409, "step": 231 }, { "epoch": 21.096969696969698, "grad_norm": 0.2626677453517914, "learning_rate": 4.7680000000000004e-05, "loss": 1.5711, "step": 232 }, { "epoch": 21.193939393939395, "grad_norm": 0.257974237203598, "learning_rate": 4.767e-05, "loss": 1.5816, "step": 233 }, { "epoch": 21.29090909090909, "grad_norm": 0.27121230959892273, "learning_rate": 4.766000000000001e-05, "loss": 1.5381, "step": 234 }, { "epoch": 21.387878787878787, "grad_norm": 0.2541174292564392, "learning_rate": 4.765e-05, "loss": 1.5695, "step": 235 }, { "epoch": 21.484848484848484, "grad_norm": 0.25187304615974426, "learning_rate": 4.7640000000000005e-05, "loss": 1.5354, "step": 236 }, { "epoch": 21.581818181818182, "grad_norm": 0.2590962052345276, "learning_rate": 4.763e-05, "loss": 1.5459, "step": 237 }, { "epoch": 21.67878787878788, "grad_norm": 0.2600695788860321, "learning_rate": 4.762e-05, "loss": 1.5432, "step": 238 }, { "epoch": 21.775757575757577, "grad_norm": 0.2626539468765259, "learning_rate": 4.761000000000001e-05, "loss": 1.5048, "step": 239 }, { "epoch": 21.87272727272727, "grad_norm": 0.2486988604068756, "learning_rate": 4.76e-05, "loss": 1.5393, "step": 240 }, { "epoch": 21.87272727272727, "eval_loss": 1.534799337387085, "eval_runtime": 2.1443, "eval_samples_per_second": 25.649, "eval_steps_per_second": 3.264, "step": 240 }, { "epoch": 21.96969696969697, "grad_norm": 0.2648991644382477, "learning_rate": 4.7590000000000003e-05, "loss": 1.5515, "step": 241 }, { "epoch": 22.0, "grad_norm": 0.28589197993278503, "learning_rate": 4.758e-05, "loss": 1.5614, "step": 242 }, { "epoch": 22.096969696969698, "grad_norm": 0.2596450746059418, "learning_rate": 4.757e-05, "loss": 1.5398, "step": 243 }, { "epoch": 22.193939393939395, "grad_norm": 0.24288393557071686, "learning_rate": 4.7560000000000005e-05, "loss": 1.4971, "step": 244 }, { "epoch": 22.29090909090909, "grad_norm": 0.2506029009819031, "learning_rate": 4.755e-05, "loss": 1.4791, "step": 245 }, { "epoch": 22.387878787878787, "grad_norm": 0.23702731728553772, "learning_rate": 4.754e-05, "loss": 1.558, "step": 246 }, { "epoch": 22.484848484848484, "grad_norm": 0.2387530505657196, "learning_rate": 4.753e-05, "loss": 1.4622, "step": 247 }, { "epoch": 22.581818181818182, "grad_norm": 0.23143506050109863, "learning_rate": 4.7520000000000006e-05, "loss": 1.5016, "step": 248 }, { "epoch": 22.67878787878788, "grad_norm": 0.24316754937171936, "learning_rate": 4.7510000000000004e-05, "loss": 1.5126, "step": 249 }, { "epoch": 22.775757575757577, "grad_norm": 0.22900234162807465, "learning_rate": 4.75e-05, "loss": 1.477, "step": 250 }, { "epoch": 22.775757575757577, "eval_loss": 1.4960139989852905, "eval_runtime": 2.1603, "eval_samples_per_second": 25.459, "eval_steps_per_second": 3.24, "step": 250 }, { "epoch": 22.87272727272727, "grad_norm": 0.22426266968250275, "learning_rate": 4.749e-05, "loss": 1.5176, "step": 251 }, { "epoch": 22.96969696969697, "grad_norm": 0.21992135047912598, "learning_rate": 4.748e-05, "loss": 1.4845, "step": 252 }, { "epoch": 23.0, "grad_norm": 0.27798858284950256, "learning_rate": 4.7470000000000005e-05, "loss": 1.456, "step": 253 }, { "epoch": 23.096969696969698, "grad_norm": 0.23832809925079346, "learning_rate": 4.746e-05, "loss": 1.4871, "step": 254 }, { "epoch": 23.193939393939395, "grad_norm": 0.2412337064743042, "learning_rate": 4.745e-05, "loss": 1.4993, "step": 255 }, { "epoch": 23.29090909090909, "grad_norm": 0.22906090319156647, "learning_rate": 4.744e-05, "loss": 1.4443, "step": 256 }, { "epoch": 23.387878787878787, "grad_norm": 0.22417926788330078, "learning_rate": 4.7430000000000005e-05, "loss": 1.5002, "step": 257 }, { "epoch": 23.484848484848484, "grad_norm": 0.22132626175880432, "learning_rate": 4.742e-05, "loss": 1.4494, "step": 258 }, { "epoch": 23.581818181818182, "grad_norm": 0.2173018604516983, "learning_rate": 4.741e-05, "loss": 1.4401, "step": 259 }, { "epoch": 23.67878787878788, "grad_norm": 0.23397614061832428, "learning_rate": 4.74e-05, "loss": 1.4618, "step": 260 }, { "epoch": 23.67878787878788, "eval_loss": 1.462380051612854, "eval_runtime": 2.1544, "eval_samples_per_second": 25.529, "eval_steps_per_second": 3.249, "step": 260 }, { "epoch": 23.775757575757577, "grad_norm": 0.219797283411026, "learning_rate": 4.739e-05, "loss": 1.4642, "step": 261 }, { "epoch": 23.87272727272727, "grad_norm": 0.23293839395046234, "learning_rate": 4.7380000000000004e-05, "loss": 1.4228, "step": 262 }, { "epoch": 23.96969696969697, "grad_norm": 0.22269245982170105, "learning_rate": 4.737e-05, "loss": 1.4503, "step": 263 }, { "epoch": 24.0, "grad_norm": 0.2525419592857361, "learning_rate": 4.736000000000001e-05, "loss": 1.4629, "step": 264 }, { "epoch": 24.096969696969698, "grad_norm": 0.2224574089050293, "learning_rate": 4.735e-05, "loss": 1.4174, "step": 265 }, { "epoch": 24.193939393939395, "grad_norm": 0.21884828805923462, "learning_rate": 4.7340000000000004e-05, "loss": 1.436, "step": 266 }, { "epoch": 24.29090909090909, "grad_norm": 0.21463195979595184, "learning_rate": 4.733e-05, "loss": 1.431, "step": 267 }, { "epoch": 24.387878787878787, "grad_norm": 0.20648930966854095, "learning_rate": 4.732e-05, "loss": 1.4178, "step": 268 }, { "epoch": 24.484848484848484, "grad_norm": 0.2181093543767929, "learning_rate": 4.7310000000000006e-05, "loss": 1.4447, "step": 269 }, { "epoch": 24.581818181818182, "grad_norm": 0.205455020070076, "learning_rate": 4.73e-05, "loss": 1.4116, "step": 270 }, { "epoch": 24.581818181818182, "eval_loss": 1.432446002960205, "eval_runtime": 2.1882, "eval_samples_per_second": 25.135, "eval_steps_per_second": 3.199, "step": 270 }, { "epoch": 24.67878787878788, "grad_norm": 0.21049101650714874, "learning_rate": 4.729e-05, "loss": 1.4514, "step": 271 }, { "epoch": 24.775757575757577, "grad_norm": 0.2380298376083374, "learning_rate": 4.728e-05, "loss": 1.4535, "step": 272 }, { "epoch": 24.87272727272727, "grad_norm": 0.20930226147174835, "learning_rate": 4.7270000000000007e-05, "loss": 1.4293, "step": 273 }, { "epoch": 24.96969696969697, "grad_norm": 0.24193055927753448, "learning_rate": 4.7260000000000005e-05, "loss": 1.3748, "step": 274 }, { "epoch": 25.0, "grad_norm": 0.27937281131744385, "learning_rate": 4.7249999999999997e-05, "loss": 1.4503, "step": 275 }, { "epoch": 25.096969696969698, "grad_norm": 0.20097865164279938, "learning_rate": 4.724e-05, "loss": 1.3747, "step": 276 }, { "epoch": 25.193939393939395, "grad_norm": 0.20190516114234924, "learning_rate": 4.723e-05, "loss": 1.406, "step": 277 }, { "epoch": 25.29090909090909, "grad_norm": 0.2020610272884369, "learning_rate": 4.7220000000000005e-05, "loss": 1.4089, "step": 278 }, { "epoch": 25.387878787878787, "grad_norm": 0.21105410158634186, "learning_rate": 4.7210000000000004e-05, "loss": 1.3805, "step": 279 }, { "epoch": 25.484848484848484, "grad_norm": 0.22814784944057465, "learning_rate": 4.72e-05, "loss": 1.4185, "step": 280 }, { "epoch": 25.484848484848484, "eval_loss": 1.405413269996643, "eval_runtime": 2.1372, "eval_samples_per_second": 25.735, "eval_steps_per_second": 3.275, "step": 280 }, { "epoch": 25.581818181818182, "grad_norm": 0.21340583264827728, "learning_rate": 4.719e-05, "loss": 1.4186, "step": 281 }, { "epoch": 25.67878787878788, "grad_norm": 0.19968931376934052, "learning_rate": 4.718e-05, "loss": 1.4559, "step": 282 }, { "epoch": 25.775757575757577, "grad_norm": 0.23844146728515625, "learning_rate": 4.7170000000000004e-05, "loss": 1.3384, "step": 283 }, { "epoch": 25.87272727272727, "grad_norm": 0.21829138696193695, "learning_rate": 4.716e-05, "loss": 1.4191, "step": 284 }, { "epoch": 25.96969696969697, "grad_norm": 0.2160981446504593, "learning_rate": 4.715e-05, "loss": 1.378, "step": 285 }, { "epoch": 26.0, "grad_norm": 0.26061224937438965, "learning_rate": 4.714e-05, "loss": 1.2759, "step": 286 }, { "epoch": 26.096969696969698, "grad_norm": 0.20122674107551575, "learning_rate": 4.7130000000000004e-05, "loss": 1.3796, "step": 287 }, { "epoch": 26.193939393939395, "grad_norm": 0.19445091485977173, "learning_rate": 4.712e-05, "loss": 1.3533, "step": 288 }, { "epoch": 26.29090909090909, "grad_norm": 0.224958136677742, "learning_rate": 4.711e-05, "loss": 1.4088, "step": 289 }, { "epoch": 26.387878787878787, "grad_norm": 0.20992790162563324, "learning_rate": 4.71e-05, "loss": 1.3567, "step": 290 }, { "epoch": 26.387878787878787, "eval_loss": 1.379786491394043, "eval_runtime": 2.159, "eval_samples_per_second": 25.475, "eval_steps_per_second": 3.242, "step": 290 }, { "epoch": 26.484848484848484, "grad_norm": 0.20614048838615417, "learning_rate": 4.709e-05, "loss": 1.3977, "step": 291 }, { "epoch": 26.581818181818182, "grad_norm": 0.19888976216316223, "learning_rate": 4.708e-05, "loss": 1.3461, "step": 292 }, { "epoch": 26.67878787878788, "grad_norm": 0.19490261375904083, "learning_rate": 4.707e-05, "loss": 1.3935, "step": 293 }, { "epoch": 26.775757575757577, "grad_norm": 0.19813090562820435, "learning_rate": 4.706000000000001e-05, "loss": 1.3782, "step": 294 }, { "epoch": 26.87272727272727, "grad_norm": 0.23433801531791687, "learning_rate": 4.705e-05, "loss": 1.3338, "step": 295 }, { "epoch": 26.96969696969697, "grad_norm": 0.23675957322120667, "learning_rate": 4.7040000000000004e-05, "loss": 1.3322, "step": 296 }, { "epoch": 27.0, "grad_norm": 0.27610862255096436, "learning_rate": 4.703e-05, "loss": 1.352, "step": 297 }, { "epoch": 27.096969696969698, "grad_norm": 0.2032589614391327, "learning_rate": 4.702e-05, "loss": 1.3404, "step": 298 }, { "epoch": 27.193939393939395, "grad_norm": 0.19773827493190765, "learning_rate": 4.7010000000000006e-05, "loss": 1.3318, "step": 299 }, { "epoch": 27.29090909090909, "grad_norm": 0.26485851407051086, "learning_rate": 4.7e-05, "loss": 1.3848, "step": 300 }, { "epoch": 27.29090909090909, "eval_loss": 1.3541288375854492, "eval_runtime": 2.1296, "eval_samples_per_second": 25.827, "eval_steps_per_second": 3.287, "step": 300 }, { "epoch": 27.387878787878787, "grad_norm": 0.21880951523780823, "learning_rate": 4.699e-05, "loss": 1.3282, "step": 301 }, { "epoch": 27.484848484848484, "grad_norm": 0.20713409781455994, "learning_rate": 4.698e-05, "loss": 1.3377, "step": 302 }, { "epoch": 27.581818181818182, "grad_norm": 0.20707346498966217, "learning_rate": 4.6970000000000006e-05, "loss": 1.3616, "step": 303 }, { "epoch": 27.67878787878788, "grad_norm": 0.20824404060840607, "learning_rate": 4.6960000000000004e-05, "loss": 1.347, "step": 304 }, { "epoch": 27.775757575757577, "grad_norm": 0.2190723717212677, "learning_rate": 4.695e-05, "loss": 1.334, "step": 305 }, { "epoch": 27.87272727272727, "grad_norm": 0.23655390739440918, "learning_rate": 4.694e-05, "loss": 1.3241, "step": 306 }, { "epoch": 27.96969696969697, "grad_norm": 0.2129470258951187, "learning_rate": 4.693e-05, "loss": 1.2995, "step": 307 }, { "epoch": 28.0, "grad_norm": 0.2688191533088684, "learning_rate": 4.6920000000000005e-05, "loss": 1.3388, "step": 308 }, { "epoch": 28.096969696969698, "grad_norm": 0.2122860997915268, "learning_rate": 4.691e-05, "loss": 1.3415, "step": 309 }, { "epoch": 28.193939393939395, "grad_norm": 0.21898894011974335, "learning_rate": 4.69e-05, "loss": 1.3114, "step": 310 }, { "epoch": 28.193939393939395, "eval_loss": 1.328099250793457, "eval_runtime": 2.136, "eval_samples_per_second": 25.749, "eval_steps_per_second": 3.277, "step": 310 }, { "epoch": 28.29090909090909, "grad_norm": 0.21611572802066803, "learning_rate": 4.689e-05, "loss": 1.3003, "step": 311 }, { "epoch": 28.387878787878787, "grad_norm": 0.2057645171880722, "learning_rate": 4.688e-05, "loss": 1.2691, "step": 312 }, { "epoch": 28.484848484848484, "grad_norm": 0.20616444945335388, "learning_rate": 4.6870000000000004e-05, "loss": 1.2799, "step": 313 }, { "epoch": 28.581818181818182, "grad_norm": 0.21262364089488983, "learning_rate": 4.686e-05, "loss": 1.3319, "step": 314 }, { "epoch": 28.67878787878788, "grad_norm": 0.2020464539527893, "learning_rate": 4.685000000000001e-05, "loss": 1.3029, "step": 315 }, { "epoch": 28.775757575757577, "grad_norm": 0.21590180695056915, "learning_rate": 4.684e-05, "loss": 1.3115, "step": 316 }, { "epoch": 28.87272727272727, "grad_norm": 0.2041538804769516, "learning_rate": 4.6830000000000004e-05, "loss": 1.3516, "step": 317 }, { "epoch": 28.96969696969697, "grad_norm": 0.2074449360370636, "learning_rate": 4.682e-05, "loss": 1.2955, "step": 318 }, { "epoch": 29.0, "grad_norm": 0.3378038704395294, "learning_rate": 4.681e-05, "loss": 1.3059, "step": 319 }, { "epoch": 29.096969696969698, "grad_norm": 0.21807034313678741, "learning_rate": 4.6800000000000006e-05, "loss": 1.2858, "step": 320 }, { "epoch": 29.096969696969698, "eval_loss": 1.3011435270309448, "eval_runtime": 2.161, "eval_samples_per_second": 25.451, "eval_steps_per_second": 3.239, "step": 320 }, { "epoch": 29.193939393939395, "grad_norm": 0.2208559513092041, "learning_rate": 4.679e-05, "loss": 1.2667, "step": 321 }, { "epoch": 29.29090909090909, "grad_norm": 0.20598140358924866, "learning_rate": 4.678e-05, "loss": 1.3218, "step": 322 }, { "epoch": 29.387878787878787, "grad_norm": 0.23158182203769684, "learning_rate": 4.677e-05, "loss": 1.3501, "step": 323 }, { "epoch": 29.484848484848484, "grad_norm": 0.21017974615097046, "learning_rate": 4.6760000000000006e-05, "loss": 1.2647, "step": 324 }, { "epoch": 29.581818181818182, "grad_norm": 0.22472354769706726, "learning_rate": 4.6750000000000005e-05, "loss": 1.2315, "step": 325 }, { "epoch": 29.67878787878788, "grad_norm": 0.22486887872219086, "learning_rate": 4.674e-05, "loss": 1.2892, "step": 326 }, { "epoch": 29.775757575757577, "grad_norm": 0.22963334619998932, "learning_rate": 4.673e-05, "loss": 1.2826, "step": 327 }, { "epoch": 29.87272727272727, "grad_norm": 0.24132588505744934, "learning_rate": 4.672e-05, "loss": 1.2428, "step": 328 }, { "epoch": 29.96969696969697, "grad_norm": 0.2201637327671051, "learning_rate": 4.6710000000000005e-05, "loss": 1.2668, "step": 329 }, { "epoch": 30.0, "grad_norm": 0.29462799429893494, "learning_rate": 4.6700000000000003e-05, "loss": 1.2142, "step": 330 }, { "epoch": 30.0, "eval_loss": 1.2728815078735352, "eval_runtime": 2.1763, "eval_samples_per_second": 25.272, "eval_steps_per_second": 3.216, "step": 330 }, { "epoch": 30.096969696969698, "grad_norm": 0.2325703203678131, "learning_rate": 4.669e-05, "loss": 1.2378, "step": 331 }, { "epoch": 30.193939393939395, "grad_norm": 0.2217150777578354, "learning_rate": 4.668e-05, "loss": 1.274, "step": 332 }, { "epoch": 30.29090909090909, "grad_norm": 0.22642824053764343, "learning_rate": 4.6670000000000005e-05, "loss": 1.2656, "step": 333 }, { "epoch": 30.387878787878787, "grad_norm": 0.22929802536964417, "learning_rate": 4.6660000000000004e-05, "loss": 1.2234, "step": 334 }, { "epoch": 30.484848484848484, "grad_norm": 0.21472522616386414, "learning_rate": 4.665e-05, "loss": 1.2359, "step": 335 }, { "epoch": 30.581818181818182, "grad_norm": 0.21877072751522064, "learning_rate": 4.664e-05, "loss": 1.2275, "step": 336 }, { "epoch": 30.67878787878788, "grad_norm": 0.24395674467086792, "learning_rate": 4.663e-05, "loss": 1.2313, "step": 337 }, { "epoch": 30.775757575757577, "grad_norm": 0.2251306176185608, "learning_rate": 4.6620000000000004e-05, "loss": 1.2631, "step": 338 }, { "epoch": 30.87272727272727, "grad_norm": 0.2529394030570984, "learning_rate": 4.661e-05, "loss": 1.2571, "step": 339 }, { "epoch": 30.96969696969697, "grad_norm": 0.24227924644947052, "learning_rate": 4.660000000000001e-05, "loss": 1.2417, "step": 340 }, { "epoch": 30.96969696969697, "eval_loss": 1.2429355382919312, "eval_runtime": 2.175, "eval_samples_per_second": 25.287, "eval_steps_per_second": 3.218, "step": 340 }, { "epoch": 31.0, "grad_norm": 0.2832590639591217, "learning_rate": 4.659e-05, "loss": 1.2431, "step": 341 }, { "epoch": 31.096969696969698, "grad_norm": 0.23116494715213776, "learning_rate": 4.6580000000000005e-05, "loss": 1.2152, "step": 342 }, { "epoch": 31.193939393939395, "grad_norm": 0.22689251601696014, "learning_rate": 4.657e-05, "loss": 1.2302, "step": 343 }, { "epoch": 31.29090909090909, "grad_norm": 0.2684100270271301, "learning_rate": 4.656e-05, "loss": 1.1755, "step": 344 }, { "epoch": 31.387878787878787, "grad_norm": 0.2274557501077652, "learning_rate": 4.655000000000001e-05, "loss": 1.2127, "step": 345 }, { "epoch": 31.484848484848484, "grad_norm": 0.2353113293647766, "learning_rate": 4.654e-05, "loss": 1.2531, "step": 346 }, { "epoch": 31.581818181818182, "grad_norm": 0.2506870627403259, "learning_rate": 4.6530000000000003e-05, "loss": 1.1684, "step": 347 }, { "epoch": 31.67878787878788, "grad_norm": 0.25880929827690125, "learning_rate": 4.652e-05, "loss": 1.2229, "step": 348 }, { "epoch": 31.775757575757577, "grad_norm": 0.23619729280471802, "learning_rate": 4.651e-05, "loss": 1.2227, "step": 349 }, { "epoch": 31.87272727272727, "grad_norm": 0.23269791901111603, "learning_rate": 4.6500000000000005e-05, "loss": 1.2458, "step": 350 }, { "epoch": 31.87272727272727, "eval_loss": 1.211027979850769, "eval_runtime": 2.153, "eval_samples_per_second": 25.545, "eval_steps_per_second": 3.251, "step": 350 }, { "epoch": 31.96969696969697, "grad_norm": 0.22882823646068573, "learning_rate": 4.649e-05, "loss": 1.1747, "step": 351 }, { "epoch": 32.0, "grad_norm": 0.37140440940856934, "learning_rate": 4.648e-05, "loss": 1.153, "step": 352 }, { "epoch": 32.096969696969694, "grad_norm": 0.24536997079849243, "learning_rate": 4.647e-05, "loss": 1.1431, "step": 353 }, { "epoch": 32.193939393939395, "grad_norm": 0.24964717030525208, "learning_rate": 4.6460000000000006e-05, "loss": 1.188, "step": 354 }, { "epoch": 32.29090909090909, "grad_norm": 0.26268887519836426, "learning_rate": 4.6450000000000004e-05, "loss": 1.1864, "step": 355 }, { "epoch": 32.38787878787879, "grad_norm": 0.24667908251285553, "learning_rate": 4.644e-05, "loss": 1.1578, "step": 356 }, { "epoch": 32.484848484848484, "grad_norm": 0.2375129610300064, "learning_rate": 4.643e-05, "loss": 1.2097, "step": 357 }, { "epoch": 32.58181818181818, "grad_norm": 0.27233636379241943, "learning_rate": 4.642e-05, "loss": 1.134, "step": 358 }, { "epoch": 32.67878787878788, "grad_norm": 0.24921810626983643, "learning_rate": 4.6410000000000005e-05, "loss": 1.1822, "step": 359 }, { "epoch": 32.775757575757574, "grad_norm": 0.24470624327659607, "learning_rate": 4.64e-05, "loss": 1.1689, "step": 360 }, { "epoch": 32.775757575757574, "eval_loss": 1.1777660846710205, "eval_runtime": 2.1615, "eval_samples_per_second": 25.445, "eval_steps_per_second": 3.239, "step": 360 }, { "epoch": 32.872727272727275, "grad_norm": 0.23806406557559967, "learning_rate": 4.639e-05, "loss": 1.1702, "step": 361 }, { "epoch": 32.96969696969697, "grad_norm": 0.24547907710075378, "learning_rate": 4.638e-05, "loss": 1.1973, "step": 362 }, { "epoch": 33.0, "grad_norm": 0.36119315028190613, "learning_rate": 4.6370000000000005e-05, "loss": 1.1615, "step": 363 }, { "epoch": 33.096969696969694, "grad_norm": 0.2657982110977173, "learning_rate": 4.636e-05, "loss": 1.1637, "step": 364 }, { "epoch": 33.193939393939395, "grad_norm": 0.2563134729862213, "learning_rate": 4.635e-05, "loss": 1.1516, "step": 365 }, { "epoch": 33.29090909090909, "grad_norm": 0.29967230558395386, "learning_rate": 4.634e-05, "loss": 1.1549, "step": 366 }, { "epoch": 33.38787878787879, "grad_norm": 0.2566681504249573, "learning_rate": 4.633e-05, "loss": 1.1483, "step": 367 }, { "epoch": 33.484848484848484, "grad_norm": 0.24460069835186005, "learning_rate": 4.6320000000000004e-05, "loss": 1.1149, "step": 368 }, { "epoch": 33.58181818181818, "grad_norm": 0.263212114572525, "learning_rate": 4.631e-05, "loss": 1.1419, "step": 369 }, { "epoch": 33.67878787878788, "grad_norm": 0.24054227769374847, "learning_rate": 4.630000000000001e-05, "loss": 1.1073, "step": 370 }, { "epoch": 33.67878787878788, "eval_loss": 1.1430851221084595, "eval_runtime": 2.1402, "eval_samples_per_second": 25.699, "eval_steps_per_second": 3.271, "step": 370 }, { "epoch": 33.775757575757574, "grad_norm": 0.26968780159950256, "learning_rate": 4.629e-05, "loss": 1.1465, "step": 371 }, { "epoch": 33.872727272727275, "grad_norm": 0.26295608282089233, "learning_rate": 4.6280000000000004e-05, "loss": 1.1351, "step": 372 }, { "epoch": 33.96969696969697, "grad_norm": 0.2707311511039734, "learning_rate": 4.627e-05, "loss": 1.0995, "step": 373 }, { "epoch": 34.0, "grad_norm": 0.37232154607772827, "learning_rate": 4.626e-05, "loss": 1.0998, "step": 374 }, { "epoch": 34.096969696969694, "grad_norm": 0.26393061876296997, "learning_rate": 4.6250000000000006e-05, "loss": 1.1008, "step": 375 }, { "epoch": 34.193939393939395, "grad_norm": 0.25553223490715027, "learning_rate": 4.624e-05, "loss": 1.1265, "step": 376 }, { "epoch": 34.29090909090909, "grad_norm": 0.2520650625228882, "learning_rate": 4.623e-05, "loss": 1.0521, "step": 377 }, { "epoch": 34.38787878787879, "grad_norm": 0.2646835744380951, "learning_rate": 4.622e-05, "loss": 1.1236, "step": 378 }, { "epoch": 34.484848484848484, "grad_norm": 0.31807929277420044, "learning_rate": 4.6210000000000006e-05, "loss": 1.1201, "step": 379 }, { "epoch": 34.58181818181818, "grad_norm": 0.2697524428367615, "learning_rate": 4.6200000000000005e-05, "loss": 1.1187, "step": 380 }, { "epoch": 34.58181818181818, "eval_loss": 1.1067144870758057, "eval_runtime": 2.1541, "eval_samples_per_second": 25.533, "eval_steps_per_second": 3.25, "step": 380 }, { "epoch": 34.67878787878788, "grad_norm": 0.28524914383888245, "learning_rate": 4.619e-05, "loss": 1.0797, "step": 381 }, { "epoch": 34.775757575757574, "grad_norm": 0.26566869020462036, "learning_rate": 4.618e-05, "loss": 1.0689, "step": 382 }, { "epoch": 34.872727272727275, "grad_norm": 0.2833225727081299, "learning_rate": 4.617e-05, "loss": 1.0731, "step": 383 }, { "epoch": 34.96969696969697, "grad_norm": 0.2912672758102417, "learning_rate": 4.6160000000000005e-05, "loss": 1.0997, "step": 384 }, { "epoch": 35.0, "grad_norm": 0.3306087255477905, "learning_rate": 4.6150000000000004e-05, "loss": 1.0326, "step": 385 }, { "epoch": 35.096969696969694, "grad_norm": 0.2548561990261078, "learning_rate": 4.614e-05, "loss": 1.1003, "step": 386 }, { "epoch": 35.193939393939395, "grad_norm": 0.2752056121826172, "learning_rate": 4.613e-05, "loss": 1.0245, "step": 387 }, { "epoch": 35.29090909090909, "grad_norm": 0.2673061490058899, "learning_rate": 4.612e-05, "loss": 1.1164, "step": 388 }, { "epoch": 35.38787878787879, "grad_norm": 0.2884056568145752, "learning_rate": 4.6110000000000004e-05, "loss": 1.0179, "step": 389 }, { "epoch": 35.484848484848484, "grad_norm": 0.27470582723617554, "learning_rate": 4.61e-05, "loss": 1.0875, "step": 390 }, { "epoch": 35.484848484848484, "eval_loss": 1.068583369255066, "eval_runtime": 2.153, "eval_samples_per_second": 25.545, "eval_steps_per_second": 3.251, "step": 390 }, { "epoch": 35.58181818181818, "grad_norm": 0.3073423206806183, "learning_rate": 4.609e-05, "loss": 1.047, "step": 391 }, { "epoch": 35.67878787878788, "grad_norm": 0.25945353507995605, "learning_rate": 4.608e-05, "loss": 1.0403, "step": 392 }, { "epoch": 35.775757575757574, "grad_norm": 0.303960382938385, "learning_rate": 4.6070000000000004e-05, "loss": 1.1043, "step": 393 }, { "epoch": 35.872727272727275, "grad_norm": 0.26745638251304626, "learning_rate": 4.606e-05, "loss": 0.9779, "step": 394 }, { "epoch": 35.96969696969697, "grad_norm": 0.309352844953537, "learning_rate": 4.605e-05, "loss": 1.0151, "step": 395 }, { "epoch": 36.0, "grad_norm": 0.3577602505683899, "learning_rate": 4.604e-05, "loss": 1.002, "step": 396 }, { "epoch": 36.096969696969694, "grad_norm": 0.262930691242218, "learning_rate": 4.603e-05, "loss": 1.052, "step": 397 }, { "epoch": 36.193939393939395, "grad_norm": 0.2592763602733612, "learning_rate": 4.602e-05, "loss": 1.0056, "step": 398 }, { "epoch": 36.29090909090909, "grad_norm": 0.3038873076438904, "learning_rate": 4.601e-05, "loss": 1.0478, "step": 399 }, { "epoch": 36.38787878787879, "grad_norm": 0.27877938747406006, "learning_rate": 4.600000000000001e-05, "loss": 1.0019, "step": 400 }, { "epoch": 36.38787878787879, "eval_loss": 1.029101848602295, "eval_runtime": 2.1629, "eval_samples_per_second": 25.428, "eval_steps_per_second": 3.236, "step": 400 }, { "epoch": 36.484848484848484, "grad_norm": 0.29765585064888, "learning_rate": 4.599e-05, "loss": 0.968, "step": 401 }, { "epoch": 36.58181818181818, "grad_norm": 0.31292471289634705, "learning_rate": 4.5980000000000004e-05, "loss": 0.9544, "step": 402 }, { "epoch": 36.67878787878788, "grad_norm": 0.29842785000801086, "learning_rate": 4.597e-05, "loss": 1.0749, "step": 403 }, { "epoch": 36.775757575757574, "grad_norm": 0.32552585005760193, "learning_rate": 4.596e-05, "loss": 0.9925, "step": 404 }, { "epoch": 36.872727272727275, "grad_norm": 0.2806715965270996, "learning_rate": 4.5950000000000006e-05, "loss": 0.9852, "step": 405 }, { "epoch": 36.96969696969697, "grad_norm": 0.2873181402683258, "learning_rate": 4.594e-05, "loss": 0.9868, "step": 406 }, { "epoch": 37.0, "grad_norm": 0.3760148286819458, "learning_rate": 4.593e-05, "loss": 1.0563, "step": 407 }, { "epoch": 37.096969696969694, "grad_norm": 0.29149961471557617, "learning_rate": 4.592e-05, "loss": 0.9826, "step": 408 }, { "epoch": 37.193939393939395, "grad_norm": 0.3023413121700287, "learning_rate": 4.5910000000000006e-05, "loss": 0.9696, "step": 409 }, { "epoch": 37.29090909090909, "grad_norm": 0.31005623936653137, "learning_rate": 4.5900000000000004e-05, "loss": 0.9835, "step": 410 }, { "epoch": 37.29090909090909, "eval_loss": 0.9891466498374939, "eval_runtime": 2.1609, "eval_samples_per_second": 25.452, "eval_steps_per_second": 3.239, "step": 410 }, { "epoch": 37.38787878787879, "grad_norm": 0.30315306782722473, "learning_rate": 4.589e-05, "loss": 0.9656, "step": 411 }, { "epoch": 37.484848484848484, "grad_norm": 0.30635392665863037, "learning_rate": 4.588e-05, "loss": 0.9448, "step": 412 }, { "epoch": 37.58181818181818, "grad_norm": 0.2933935225009918, "learning_rate": 4.587e-05, "loss": 0.9165, "step": 413 }, { "epoch": 37.67878787878788, "grad_norm": 0.28974059224128723, "learning_rate": 4.5860000000000005e-05, "loss": 1.006, "step": 414 }, { "epoch": 37.775757575757574, "grad_norm": 0.27250662446022034, "learning_rate": 4.585e-05, "loss": 0.9674, "step": 415 }, { "epoch": 37.872727272727275, "grad_norm": 0.29652273654937744, "learning_rate": 4.584e-05, "loss": 0.9554, "step": 416 }, { "epoch": 37.96969696969697, "grad_norm": 0.3232543170452118, "learning_rate": 4.583e-05, "loss": 0.9759, "step": 417 }, { "epoch": 38.0, "grad_norm": 0.3857976794242859, "learning_rate": 4.5820000000000005e-05, "loss": 0.88, "step": 418 }, { "epoch": 38.096969696969694, "grad_norm": 0.2974799573421478, "learning_rate": 4.5810000000000004e-05, "loss": 0.9273, "step": 419 }, { "epoch": 38.193939393939395, "grad_norm": 0.30977925658226013, "learning_rate": 4.58e-05, "loss": 0.9634, "step": 420 }, { "epoch": 38.193939393939395, "eval_loss": 0.9490349292755127, "eval_runtime": 2.1484, "eval_samples_per_second": 25.6, "eval_steps_per_second": 3.258, "step": 420 }, { "epoch": 38.29090909090909, "grad_norm": 0.3215106725692749, "learning_rate": 4.579e-05, "loss": 0.9166, "step": 421 }, { "epoch": 38.38787878787879, "grad_norm": 0.3041662871837616, "learning_rate": 4.578e-05, "loss": 0.9446, "step": 422 }, { "epoch": 38.484848484848484, "grad_norm": 0.3011530637741089, "learning_rate": 4.5770000000000004e-05, "loss": 0.9306, "step": 423 }, { "epoch": 38.58181818181818, "grad_norm": 0.310140997171402, "learning_rate": 4.576e-05, "loss": 0.8723, "step": 424 }, { "epoch": 38.67878787878788, "grad_norm": 0.2920651137828827, "learning_rate": 4.575e-05, "loss": 0.9286, "step": 425 }, { "epoch": 38.775757575757574, "grad_norm": 0.2789269685745239, "learning_rate": 4.574e-05, "loss": 0.9178, "step": 426 }, { "epoch": 38.872727272727275, "grad_norm": 0.30473312735557556, "learning_rate": 4.573e-05, "loss": 0.9003, "step": 427 }, { "epoch": 38.96969696969697, "grad_norm": 0.2925727963447571, "learning_rate": 4.572e-05, "loss": 0.9238, "step": 428 }, { "epoch": 39.0, "grad_norm": 0.4265953600406647, "learning_rate": 4.571e-05, "loss": 0.8359, "step": 429 }, { "epoch": 39.096969696969694, "grad_norm": 0.3189105689525604, "learning_rate": 4.5700000000000006e-05, "loss": 0.8581, "step": 430 }, { "epoch": 39.096969696969694, "eval_loss": 0.9099050760269165, "eval_runtime": 2.1501, "eval_samples_per_second": 25.58, "eval_steps_per_second": 3.256, "step": 430 }, { "epoch": 39.193939393939395, "grad_norm": 0.29996800422668457, "learning_rate": 4.569e-05, "loss": 0.9016, "step": 431 }, { "epoch": 39.29090909090909, "grad_norm": 0.3168741464614868, "learning_rate": 4.568e-05, "loss": 0.9345, "step": 432 }, { "epoch": 39.38787878787879, "grad_norm": 0.2974991500377655, "learning_rate": 4.567e-05, "loss": 0.9088, "step": 433 }, { "epoch": 39.484848484848484, "grad_norm": 0.3686419725418091, "learning_rate": 4.566e-05, "loss": 0.8999, "step": 434 }, { "epoch": 39.58181818181818, "grad_norm": 0.3079046308994293, "learning_rate": 4.5650000000000005e-05, "loss": 0.852, "step": 435 }, { "epoch": 39.67878787878788, "grad_norm": 0.3118555247783661, "learning_rate": 4.564e-05, "loss": 0.8909, "step": 436 }, { "epoch": 39.775757575757574, "grad_norm": 0.31345677375793457, "learning_rate": 4.563e-05, "loss": 0.8366, "step": 437 }, { "epoch": 39.872727272727275, "grad_norm": 0.3082335293292999, "learning_rate": 4.562e-05, "loss": 0.8281, "step": 438 }, { "epoch": 39.96969696969697, "grad_norm": 0.3386876881122589, "learning_rate": 4.5610000000000005e-05, "loss": 0.8827, "step": 439 }, { "epoch": 40.0, "grad_norm": 0.46113550662994385, "learning_rate": 4.5600000000000004e-05, "loss": 0.8334, "step": 440 }, { "epoch": 40.0, "eval_loss": 0.8736940622329712, "eval_runtime": 2.1491, "eval_samples_per_second": 25.592, "eval_steps_per_second": 3.257, "step": 440 }, { "epoch": 40.096969696969694, "grad_norm": 0.29570069909095764, "learning_rate": 4.559e-05, "loss": 0.8461, "step": 441 }, { "epoch": 40.193939393939395, "grad_norm": 0.33492523431777954, "learning_rate": 4.558e-05, "loss": 0.8443, "step": 442 }, { "epoch": 40.29090909090909, "grad_norm": 0.29383033514022827, "learning_rate": 4.557e-05, "loss": 0.805, "step": 443 }, { "epoch": 40.38787878787879, "grad_norm": 0.3032298684120178, "learning_rate": 4.5560000000000004e-05, "loss": 0.862, "step": 444 }, { "epoch": 40.484848484848484, "grad_norm": 0.3141234815120697, "learning_rate": 4.555e-05, "loss": 0.8511, "step": 445 }, { "epoch": 40.58181818181818, "grad_norm": 0.29081517457962036, "learning_rate": 4.554000000000001e-05, "loss": 0.8437, "step": 446 }, { "epoch": 40.67878787878788, "grad_norm": 0.2950437664985657, "learning_rate": 4.553e-05, "loss": 0.8204, "step": 447 }, { "epoch": 40.775757575757574, "grad_norm": 0.3393695056438446, "learning_rate": 4.5520000000000005e-05, "loss": 0.8213, "step": 448 }, { "epoch": 40.872727272727275, "grad_norm": 0.331230491399765, "learning_rate": 4.551e-05, "loss": 0.8446, "step": 449 }, { "epoch": 40.96969696969697, "grad_norm": 0.33039379119873047, "learning_rate": 4.55e-05, "loss": 0.8527, "step": 450 }, { "epoch": 40.96969696969697, "eval_loss": 0.8403151035308838, "eval_runtime": 2.1512, "eval_samples_per_second": 25.567, "eval_steps_per_second": 3.254, "step": 450 }, { "epoch": 41.0, "grad_norm": 0.47446638345718384, "learning_rate": 4.549000000000001e-05, "loss": 0.8452, "step": 451 }, { "epoch": 41.096969696969694, "grad_norm": 0.2963954210281372, "learning_rate": 4.548e-05, "loss": 0.8243, "step": 452 }, { "epoch": 41.193939393939395, "grad_norm": 0.32614511251449585, "learning_rate": 4.5470000000000003e-05, "loss": 0.7441, "step": 453 }, { "epoch": 41.29090909090909, "grad_norm": 0.2976125478744507, "learning_rate": 4.546e-05, "loss": 0.8023, "step": 454 }, { "epoch": 41.38787878787879, "grad_norm": 0.3358374834060669, "learning_rate": 4.545000000000001e-05, "loss": 0.805, "step": 455 }, { "epoch": 41.484848484848484, "grad_norm": 0.28136196732521057, "learning_rate": 4.5440000000000005e-05, "loss": 0.7902, "step": 456 }, { "epoch": 41.58181818181818, "grad_norm": 0.28420117497444153, "learning_rate": 4.543e-05, "loss": 0.8673, "step": 457 }, { "epoch": 41.67878787878788, "grad_norm": 0.32107681035995483, "learning_rate": 4.542e-05, "loss": 0.7885, "step": 458 }, { "epoch": 41.775757575757574, "grad_norm": 0.30052316188812256, "learning_rate": 4.541e-05, "loss": 0.857, "step": 459 }, { "epoch": 41.872727272727275, "grad_norm": 0.3333911895751953, "learning_rate": 4.5400000000000006e-05, "loss": 0.7672, "step": 460 }, { "epoch": 41.872727272727275, "eval_loss": 0.8103982210159302, "eval_runtime": 2.1739, "eval_samples_per_second": 25.301, "eval_steps_per_second": 3.22, "step": 460 }, { "epoch": 41.96969696969697, "grad_norm": 0.32705608010292053, "learning_rate": 4.5390000000000004e-05, "loss": 0.8136, "step": 461 }, { "epoch": 42.0, "grad_norm": 0.3649839460849762, "learning_rate": 4.538e-05, "loss": 0.773, "step": 462 }, { "epoch": 42.096969696969694, "grad_norm": 0.313376784324646, "learning_rate": 4.537e-05, "loss": 0.7613, "step": 463 }, { "epoch": 42.193939393939395, "grad_norm": 0.3517065644264221, "learning_rate": 4.536e-05, "loss": 0.7411, "step": 464 }, { "epoch": 42.29090909090909, "grad_norm": 0.3445081114768982, "learning_rate": 4.5350000000000005e-05, "loss": 0.8118, "step": 465 }, { "epoch": 42.38787878787879, "grad_norm": 0.2820802330970764, "learning_rate": 4.534e-05, "loss": 0.7625, "step": 466 }, { "epoch": 42.484848484848484, "grad_norm": 0.31504544615745544, "learning_rate": 4.533e-05, "loss": 0.7453, "step": 467 }, { "epoch": 42.58181818181818, "grad_norm": 0.36062583327293396, "learning_rate": 4.532e-05, "loss": 0.7759, "step": 468 }, { "epoch": 42.67878787878788, "grad_norm": 0.2900105118751526, "learning_rate": 4.5310000000000005e-05, "loss": 0.7919, "step": 469 }, { "epoch": 42.775757575757574, "grad_norm": 0.2746892273426056, "learning_rate": 4.53e-05, "loss": 0.7571, "step": 470 }, { "epoch": 42.775757575757574, "eval_loss": 0.7850269079208374, "eval_runtime": 2.1528, "eval_samples_per_second": 25.549, "eval_steps_per_second": 3.252, "step": 470 }, { "epoch": 42.872727272727275, "grad_norm": 0.33745649456977844, "learning_rate": 4.529e-05, "loss": 0.7894, "step": 471 }, { "epoch": 42.96969696969697, "grad_norm": 0.36385348439216614, "learning_rate": 4.528e-05, "loss": 0.8226, "step": 472 }, { "epoch": 43.0, "grad_norm": 0.42536821961402893, "learning_rate": 4.527e-05, "loss": 0.7393, "step": 473 }, { "epoch": 43.096969696969694, "grad_norm": 0.2656398117542267, "learning_rate": 4.5260000000000004e-05, "loss": 0.7442, "step": 474 }, { "epoch": 43.193939393939395, "grad_norm": 0.28830406069755554, "learning_rate": 4.525e-05, "loss": 0.7747, "step": 475 }, { "epoch": 43.29090909090909, "grad_norm": 0.36671680212020874, "learning_rate": 4.524000000000001e-05, "loss": 0.8123, "step": 476 }, { "epoch": 43.38787878787879, "grad_norm": 0.3925282657146454, "learning_rate": 4.523e-05, "loss": 0.6966, "step": 477 }, { "epoch": 43.484848484848484, "grad_norm": 0.3425087332725525, "learning_rate": 4.5220000000000004e-05, "loss": 0.7403, "step": 478 }, { "epoch": 43.58181818181818, "grad_norm": 0.33888742327690125, "learning_rate": 4.521e-05, "loss": 0.7721, "step": 479 }, { "epoch": 43.67878787878788, "grad_norm": 0.3055487871170044, "learning_rate": 4.52e-05, "loss": 0.7711, "step": 480 }, { "epoch": 43.67878787878788, "eval_loss": 0.7622922658920288, "eval_runtime": 2.1454, "eval_samples_per_second": 25.637, "eval_steps_per_second": 3.263, "step": 480 }, { "epoch": 43.775757575757574, "grad_norm": 0.3457646667957306, "learning_rate": 4.5190000000000006e-05, "loss": 0.6743, "step": 481 }, { "epoch": 43.872727272727275, "grad_norm": 0.31191420555114746, "learning_rate": 4.518e-05, "loss": 0.7242, "step": 482 }, { "epoch": 43.96969696969697, "grad_norm": 0.2905394434928894, "learning_rate": 4.517e-05, "loss": 0.7729, "step": 483 }, { "epoch": 44.0, "grad_norm": 0.5394179821014404, "learning_rate": 4.516e-05, "loss": 0.758, "step": 484 }, { "epoch": 44.096969696969694, "grad_norm": 0.31047454476356506, "learning_rate": 4.5150000000000006e-05, "loss": 0.7289, "step": 485 }, { "epoch": 44.193939393939395, "grad_norm": 0.4167446792125702, "learning_rate": 4.5140000000000005e-05, "loss": 0.6943, "step": 486 }, { "epoch": 44.29090909090909, "grad_norm": 0.41475990414619446, "learning_rate": 4.513e-05, "loss": 0.6968, "step": 487 }, { "epoch": 44.38787878787879, "grad_norm": 0.3056599795818329, "learning_rate": 4.512e-05, "loss": 0.6853, "step": 488 }, { "epoch": 44.484848484848484, "grad_norm": 0.3095908761024475, "learning_rate": 4.511e-05, "loss": 0.6931, "step": 489 }, { "epoch": 44.58181818181818, "grad_norm": 0.44538062810897827, "learning_rate": 4.5100000000000005e-05, "loss": 0.8197, "step": 490 }, { "epoch": 44.58181818181818, "eval_loss": 0.7430852651596069, "eval_runtime": 2.1626, "eval_samples_per_second": 25.433, "eval_steps_per_second": 3.237, "step": 490 }, { "epoch": 44.67878787878788, "grad_norm": 0.37403160333633423, "learning_rate": 4.5090000000000004e-05, "loss": 0.75, "step": 491 }, { "epoch": 44.775757575757574, "grad_norm": 0.33197081089019775, "learning_rate": 4.508e-05, "loss": 0.7516, "step": 492 }, { "epoch": 44.872727272727275, "grad_norm": 0.4307508170604706, "learning_rate": 4.507e-05, "loss": 0.7692, "step": 493 }, { "epoch": 44.96969696969697, "grad_norm": 0.3568163812160492, "learning_rate": 4.506e-05, "loss": 0.7184, "step": 494 }, { "epoch": 45.0, "grad_norm": 0.5191144347190857, "learning_rate": 4.5050000000000004e-05, "loss": 0.5799, "step": 495 }, { "epoch": 45.096969696969694, "grad_norm": 0.29018500447273254, "learning_rate": 4.504e-05, "loss": 0.6881, "step": 496 }, { "epoch": 45.193939393939395, "grad_norm": 0.3260664641857147, "learning_rate": 4.503e-05, "loss": 0.6887, "step": 497 }, { "epoch": 45.29090909090909, "grad_norm": 0.33074823021888733, "learning_rate": 4.502e-05, "loss": 0.7306, "step": 498 }, { "epoch": 45.38787878787879, "grad_norm": 0.3127662241458893, "learning_rate": 4.5010000000000004e-05, "loss": 0.6712, "step": 499 }, { "epoch": 45.484848484848484, "grad_norm": 0.4097149074077606, "learning_rate": 4.5e-05, "loss": 0.7328, "step": 500 }, { "epoch": 45.484848484848484, "eval_loss": 0.725439190864563, "eval_runtime": 2.171, "eval_samples_per_second": 25.333, "eval_steps_per_second": 3.224, "step": 500 }, { "epoch": 45.58181818181818, "grad_norm": 0.4050314426422119, "learning_rate": 4.499e-05, "loss": 0.7028, "step": 501 }, { "epoch": 45.67878787878788, "grad_norm": 0.2748536467552185, "learning_rate": 4.498e-05, "loss": 0.7184, "step": 502 }, { "epoch": 45.775757575757574, "grad_norm": 0.3762427568435669, "learning_rate": 4.497e-05, "loss": 0.7085, "step": 503 }, { "epoch": 45.872727272727275, "grad_norm": 0.3729848265647888, "learning_rate": 4.496e-05, "loss": 0.7336, "step": 504 }, { "epoch": 45.96969696969697, "grad_norm": 0.38429006934165955, "learning_rate": 4.495e-05, "loss": 0.7051, "step": 505 }, { "epoch": 46.0, "grad_norm": 0.4606812298297882, "learning_rate": 4.494000000000001e-05, "loss": 0.6661, "step": 506 }, { "epoch": 46.096969696969694, "grad_norm": 0.3665672242641449, "learning_rate": 4.493e-05, "loss": 0.6646, "step": 507 }, { "epoch": 46.193939393939395, "grad_norm": 0.3234887421131134, "learning_rate": 4.4920000000000004e-05, "loss": 0.7053, "step": 508 }, { "epoch": 46.29090909090909, "grad_norm": 0.34196731448173523, "learning_rate": 4.491e-05, "loss": 0.6993, "step": 509 }, { "epoch": 46.38787878787879, "grad_norm": 0.2899550497531891, "learning_rate": 4.49e-05, "loss": 0.6612, "step": 510 }, { "epoch": 46.38787878787879, "eval_loss": 0.7102727293968201, "eval_runtime": 2.1663, "eval_samples_per_second": 25.389, "eval_steps_per_second": 3.231, "step": 510 }, { "epoch": 46.484848484848484, "grad_norm": 0.42077502608299255, "learning_rate": 4.4890000000000006e-05, "loss": 0.7378, "step": 511 }, { "epoch": 46.58181818181818, "grad_norm": 0.34420883655548096, "learning_rate": 4.488e-05, "loss": 0.7084, "step": 512 }, { "epoch": 46.67878787878788, "grad_norm": 0.31922397017478943, "learning_rate": 4.487e-05, "loss": 0.7148, "step": 513 }, { "epoch": 46.775757575757574, "grad_norm": 0.3793002665042877, "learning_rate": 4.486e-05, "loss": 0.6306, "step": 514 }, { "epoch": 46.872727272727275, "grad_norm": 0.33293092250823975, "learning_rate": 4.4850000000000006e-05, "loss": 0.717, "step": 515 }, { "epoch": 46.96969696969697, "grad_norm": 0.31802603602409363, "learning_rate": 4.4840000000000004e-05, "loss": 0.6785, "step": 516 }, { "epoch": 47.0, "grad_norm": 0.38046082854270935, "learning_rate": 4.483e-05, "loss": 0.6203, "step": 517 }, { "epoch": 47.096969696969694, "grad_norm": 0.3020094037055969, "learning_rate": 4.482e-05, "loss": 0.6657, "step": 518 }, { "epoch": 47.193939393939395, "grad_norm": 0.30682918429374695, "learning_rate": 4.481e-05, "loss": 0.6845, "step": 519 }, { "epoch": 47.29090909090909, "grad_norm": 0.3144380450248718, "learning_rate": 4.4800000000000005e-05, "loss": 0.6945, "step": 520 }, { "epoch": 47.29090909090909, "eval_loss": 0.6961836218833923, "eval_runtime": 2.1618, "eval_samples_per_second": 25.442, "eval_steps_per_second": 3.238, "step": 520 }, { "epoch": 47.38787878787879, "grad_norm": 0.33046039938926697, "learning_rate": 4.479e-05, "loss": 0.7259, "step": 521 }, { "epoch": 47.484848484848484, "grad_norm": 0.2844446897506714, "learning_rate": 4.478e-05, "loss": 0.6932, "step": 522 }, { "epoch": 47.58181818181818, "grad_norm": 0.3066650927066803, "learning_rate": 4.477e-05, "loss": 0.6395, "step": 523 }, { "epoch": 47.67878787878788, "grad_norm": 0.2580311596393585, "learning_rate": 4.4760000000000005e-05, "loss": 0.6321, "step": 524 }, { "epoch": 47.775757575757574, "grad_norm": 0.28855329751968384, "learning_rate": 4.4750000000000004e-05, "loss": 0.6919, "step": 525 }, { "epoch": 47.872727272727275, "grad_norm": 0.28060483932495117, "learning_rate": 4.474e-05, "loss": 0.6473, "step": 526 }, { "epoch": 47.96969696969697, "grad_norm": 0.3266681730747223, "learning_rate": 4.473e-05, "loss": 0.6605, "step": 527 }, { "epoch": 48.0, "grad_norm": 0.5188031792640686, "learning_rate": 4.472e-05, "loss": 0.722, "step": 528 }, { "epoch": 48.096969696969694, "grad_norm": 0.3377719223499298, "learning_rate": 4.4710000000000004e-05, "loss": 0.6797, "step": 529 }, { "epoch": 48.193939393939395, "grad_norm": 0.35471999645233154, "learning_rate": 4.47e-05, "loss": 0.6555, "step": 530 }, { "epoch": 48.193939393939395, "eval_loss": 0.6851030588150024, "eval_runtime": 2.1559, "eval_samples_per_second": 25.512, "eval_steps_per_second": 3.247, "step": 530 }, { "epoch": 48.29090909090909, "grad_norm": 0.480877548456192, "learning_rate": 4.469e-05, "loss": 0.6919, "step": 531 }, { "epoch": 48.38787878787879, "grad_norm": 0.2577911913394928, "learning_rate": 4.468e-05, "loss": 0.6644, "step": 532 }, { "epoch": 48.484848484848484, "grad_norm": 0.3358922302722931, "learning_rate": 4.467e-05, "loss": 0.6243, "step": 533 }, { "epoch": 48.58181818181818, "grad_norm": 0.42016032338142395, "learning_rate": 4.466e-05, "loss": 0.7167, "step": 534 }, { "epoch": 48.67878787878788, "grad_norm": 0.34250354766845703, "learning_rate": 4.465e-05, "loss": 0.6782, "step": 535 }, { "epoch": 48.775757575757574, "grad_norm": 0.3065449893474579, "learning_rate": 4.4640000000000006e-05, "loss": 0.6649, "step": 536 }, { "epoch": 48.872727272727275, "grad_norm": 0.23833245038986206, "learning_rate": 4.463e-05, "loss": 0.6054, "step": 537 }, { "epoch": 48.96969696969697, "grad_norm": 0.29932355880737305, "learning_rate": 4.462e-05, "loss": 0.6268, "step": 538 }, { "epoch": 49.0, "grad_norm": 0.4887162148952484, "learning_rate": 4.461e-05, "loss": 0.7053, "step": 539 }, { "epoch": 49.096969696969694, "grad_norm": 0.29336071014404297, "learning_rate": 4.46e-05, "loss": 0.7038, "step": 540 }, { "epoch": 49.096969696969694, "eval_loss": 0.6746473908424377, "eval_runtime": 2.1499, "eval_samples_per_second": 25.583, "eval_steps_per_second": 3.256, "step": 540 }, { "epoch": 49.193939393939395, "grad_norm": 0.2892014682292938, "learning_rate": 4.4590000000000005e-05, "loss": 0.6504, "step": 541 }, { "epoch": 49.29090909090909, "grad_norm": 0.30927982926368713, "learning_rate": 4.458e-05, "loss": 0.646, "step": 542 }, { "epoch": 49.38787878787879, "grad_norm": 0.2865374684333801, "learning_rate": 4.457e-05, "loss": 0.6393, "step": 543 }, { "epoch": 49.484848484848484, "grad_norm": 0.3668179512023926, "learning_rate": 4.456e-05, "loss": 0.6251, "step": 544 }, { "epoch": 49.58181818181818, "grad_norm": 0.3081008195877075, "learning_rate": 4.4550000000000005e-05, "loss": 0.6287, "step": 545 }, { "epoch": 49.67878787878788, "grad_norm": 0.30829358100891113, "learning_rate": 4.4540000000000004e-05, "loss": 0.6336, "step": 546 }, { "epoch": 49.775757575757574, "grad_norm": 0.34429988265037537, "learning_rate": 4.453e-05, "loss": 0.6214, "step": 547 }, { "epoch": 49.872727272727275, "grad_norm": 0.31597399711608887, "learning_rate": 4.452e-05, "loss": 0.629, "step": 548 }, { "epoch": 49.96969696969697, "grad_norm": 0.3678438067436218, "learning_rate": 4.451e-05, "loss": 0.7262, "step": 549 }, { "epoch": 50.0, "grad_norm": 0.4396074414253235, "learning_rate": 4.4500000000000004e-05, "loss": 0.6663, "step": 550 }, { "epoch": 50.0, "eval_loss": 0.6663048267364502, "eval_runtime": 2.1749, "eval_samples_per_second": 25.289, "eval_steps_per_second": 3.219, "step": 550 }, { "epoch": 50.096969696969694, "grad_norm": 0.32783371210098267, "learning_rate": 4.449e-05, "loss": 0.6205, "step": 551 }, { "epoch": 50.193939393939395, "grad_norm": 0.3056533634662628, "learning_rate": 4.448e-05, "loss": 0.6235, "step": 552 }, { "epoch": 50.29090909090909, "grad_norm": 0.2603963017463684, "learning_rate": 4.447e-05, "loss": 0.6332, "step": 553 }, { "epoch": 50.38787878787879, "grad_norm": 0.365771621465683, "learning_rate": 4.4460000000000005e-05, "loss": 0.667, "step": 554 }, { "epoch": 50.484848484848484, "grad_norm": 0.33529090881347656, "learning_rate": 4.445e-05, "loss": 0.6358, "step": 555 }, { "epoch": 50.58181818181818, "grad_norm": 0.3174441456794739, "learning_rate": 4.444e-05, "loss": 0.6445, "step": 556 }, { "epoch": 50.67878787878788, "grad_norm": 0.30779361724853516, "learning_rate": 4.443e-05, "loss": 0.6842, "step": 557 }, { "epoch": 50.775757575757574, "grad_norm": 0.2936500906944275, "learning_rate": 4.442e-05, "loss": 0.6634, "step": 558 }, { "epoch": 50.872727272727275, "grad_norm": 0.295730859041214, "learning_rate": 4.4410000000000003e-05, "loss": 0.6013, "step": 559 }, { "epoch": 50.96969696969697, "grad_norm": 0.2970040738582611, "learning_rate": 4.44e-05, "loss": 0.6143, "step": 560 }, { "epoch": 50.96969696969697, "eval_loss": 0.658263623714447, "eval_runtime": 2.1643, "eval_samples_per_second": 25.412, "eval_steps_per_second": 3.234, "step": 560 }, { "epoch": 51.0, "grad_norm": 0.4583488404750824, "learning_rate": 4.439000000000001e-05, "loss": 0.7217, "step": 561 }, { "epoch": 51.096969696969694, "grad_norm": 0.3247191309928894, "learning_rate": 4.438e-05, "loss": 0.6468, "step": 562 }, { "epoch": 51.193939393939395, "grad_norm": 0.2687683701515198, "learning_rate": 4.4370000000000004e-05, "loss": 0.6075, "step": 563 }, { "epoch": 51.29090909090909, "grad_norm": 0.28057968616485596, "learning_rate": 4.436e-05, "loss": 0.6772, "step": 564 }, { "epoch": 51.38787878787879, "grad_norm": 0.333745539188385, "learning_rate": 4.435e-05, "loss": 0.629, "step": 565 }, { "epoch": 51.484848484848484, "grad_norm": 0.2607054114341736, "learning_rate": 4.4340000000000006e-05, "loss": 0.6077, "step": 566 }, { "epoch": 51.58181818181818, "grad_norm": 0.3412008583545685, "learning_rate": 4.4330000000000004e-05, "loss": 0.6605, "step": 567 }, { "epoch": 51.67878787878788, "grad_norm": 0.2872512936592102, "learning_rate": 4.432e-05, "loss": 0.661, "step": 568 }, { "epoch": 51.775757575757574, "grad_norm": 0.23332110047340393, "learning_rate": 4.431e-05, "loss": 0.5995, "step": 569 }, { "epoch": 51.872727272727275, "grad_norm": 0.3014121651649475, "learning_rate": 4.43e-05, "loss": 0.5836, "step": 570 }, { "epoch": 51.872727272727275, "eval_loss": 0.6509251594543457, "eval_runtime": 2.1649, "eval_samples_per_second": 25.406, "eval_steps_per_second": 3.233, "step": 570 }, { "epoch": 51.96969696969697, "grad_norm": 0.27421829104423523, "learning_rate": 4.4290000000000005e-05, "loss": 0.6791, "step": 571 }, { "epoch": 52.0, "grad_norm": 0.4212527871131897, "learning_rate": 4.428e-05, "loss": 0.5194, "step": 572 }, { "epoch": 52.096969696969694, "grad_norm": 0.2564944326877594, "learning_rate": 4.427e-05, "loss": 0.5887, "step": 573 }, { "epoch": 52.193939393939395, "grad_norm": 0.271538645029068, "learning_rate": 4.426e-05, "loss": 0.6401, "step": 574 }, { "epoch": 52.29090909090909, "grad_norm": 0.30780258774757385, "learning_rate": 4.4250000000000005e-05, "loss": 0.676, "step": 575 }, { "epoch": 52.38787878787879, "grad_norm": 0.25906750559806824, "learning_rate": 4.424e-05, "loss": 0.6563, "step": 576 }, { "epoch": 52.484848484848484, "grad_norm": 0.3498859107494354, "learning_rate": 4.423e-05, "loss": 0.6287, "step": 577 }, { "epoch": 52.58181818181818, "grad_norm": 0.31031695008277893, "learning_rate": 4.422e-05, "loss": 0.6062, "step": 578 }, { "epoch": 52.67878787878788, "grad_norm": 0.2525545060634613, "learning_rate": 4.421e-05, "loss": 0.5684, "step": 579 }, { "epoch": 52.775757575757574, "grad_norm": 0.25607815384864807, "learning_rate": 4.4200000000000004e-05, "loss": 0.6153, "step": 580 }, { "epoch": 52.775757575757574, "eval_loss": 0.6439118385314941, "eval_runtime": 2.1484, "eval_samples_per_second": 25.6, "eval_steps_per_second": 3.258, "step": 580 }, { "epoch": 52.872727272727275, "grad_norm": 0.30004850029945374, "learning_rate": 4.419e-05, "loss": 0.6104, "step": 581 }, { "epoch": 52.96969696969697, "grad_norm": 0.27988260984420776, "learning_rate": 4.418000000000001e-05, "loss": 0.6384, "step": 582 }, { "epoch": 53.0, "grad_norm": 0.46435651183128357, "learning_rate": 4.417e-05, "loss": 0.655, "step": 583 }, { "epoch": 53.096969696969694, "grad_norm": 0.2364508956670761, "learning_rate": 4.4160000000000004e-05, "loss": 0.6002, "step": 584 }, { "epoch": 53.193939393939395, "grad_norm": 0.2596590220928192, "learning_rate": 4.415e-05, "loss": 0.6007, "step": 585 }, { "epoch": 53.29090909090909, "grad_norm": 0.32126492261886597, "learning_rate": 4.414e-05, "loss": 0.5991, "step": 586 }, { "epoch": 53.38787878787879, "grad_norm": 0.2967338263988495, "learning_rate": 4.4130000000000006e-05, "loss": 0.6089, "step": 587 }, { "epoch": 53.484848484848484, "grad_norm": 0.22388562560081482, "learning_rate": 4.412e-05, "loss": 0.6334, "step": 588 }, { "epoch": 53.58181818181818, "grad_norm": 0.34703031182289124, "learning_rate": 4.411e-05, "loss": 0.6915, "step": 589 }, { "epoch": 53.67878787878788, "grad_norm": 0.3233970105648041, "learning_rate": 4.41e-05, "loss": 0.6571, "step": 590 }, { "epoch": 53.67878787878788, "eval_loss": 0.638057291507721, "eval_runtime": 2.1659, "eval_samples_per_second": 25.394, "eval_steps_per_second": 3.232, "step": 590 }, { "epoch": 53.775757575757574, "grad_norm": 0.28742972016334534, "learning_rate": 4.4090000000000006e-05, "loss": 0.5957, "step": 591 }, { "epoch": 53.872727272727275, "grad_norm": 0.22206705808639526, "learning_rate": 4.4080000000000005e-05, "loss": 0.5216, "step": 592 }, { "epoch": 53.96969696969697, "grad_norm": 0.26034870743751526, "learning_rate": 4.407e-05, "loss": 0.6628, "step": 593 }, { "epoch": 54.0, "grad_norm": 0.5107381343841553, "learning_rate": 4.406e-05, "loss": 0.5909, "step": 594 }, { "epoch": 54.096969696969694, "grad_norm": 0.32183170318603516, "learning_rate": 4.405e-05, "loss": 0.6119, "step": 595 }, { "epoch": 54.193939393939395, "grad_norm": 0.31215551495552063, "learning_rate": 4.4040000000000005e-05, "loss": 0.6399, "step": 596 }, { "epoch": 54.29090909090909, "grad_norm": 0.2621861696243286, "learning_rate": 4.4030000000000004e-05, "loss": 0.5847, "step": 597 }, { "epoch": 54.38787878787879, "grad_norm": 0.25500035285949707, "learning_rate": 4.402e-05, "loss": 0.6267, "step": 598 }, { "epoch": 54.484848484848484, "grad_norm": 0.30199798941612244, "learning_rate": 4.401e-05, "loss": 0.589, "step": 599 }, { "epoch": 54.58181818181818, "grad_norm": 0.2709701359272003, "learning_rate": 4.4000000000000006e-05, "loss": 0.6297, "step": 600 }, { "epoch": 54.58181818181818, "eval_loss": 0.6322523355484009, "eval_runtime": 2.1489, "eval_samples_per_second": 25.594, "eval_steps_per_second": 3.257, "step": 600 }, { "epoch": 54.67878787878788, "grad_norm": 0.3060154914855957, "learning_rate": 4.3990000000000004e-05, "loss": 0.5995, "step": 601 }, { "epoch": 54.775757575757574, "grad_norm": 0.30862957239151, "learning_rate": 4.398e-05, "loss": 0.6008, "step": 602 }, { "epoch": 54.872727272727275, "grad_norm": 0.32024699449539185, "learning_rate": 4.397e-05, "loss": 0.5772, "step": 603 }, { "epoch": 54.96969696969697, "grad_norm": 0.32759636640548706, "learning_rate": 4.396e-05, "loss": 0.6523, "step": 604 }, { "epoch": 55.0, "grad_norm": 0.39472898840904236, "learning_rate": 4.3950000000000004e-05, "loss": 0.5785, "step": 605 }, { "epoch": 55.096969696969694, "grad_norm": 0.24479424953460693, "learning_rate": 4.394e-05, "loss": 0.5679, "step": 606 }, { "epoch": 55.193939393939395, "grad_norm": 0.2995283007621765, "learning_rate": 4.393e-05, "loss": 0.6243, "step": 607 }, { "epoch": 55.29090909090909, "grad_norm": 0.29041364789009094, "learning_rate": 4.392e-05, "loss": 0.6334, "step": 608 }, { "epoch": 55.38787878787879, "grad_norm": 0.31214168667793274, "learning_rate": 4.391e-05, "loss": 0.6299, "step": 609 }, { "epoch": 55.484848484848484, "grad_norm": 0.2819170653820038, "learning_rate": 4.39e-05, "loss": 0.5911, "step": 610 }, { "epoch": 55.484848484848484, "eval_loss": 0.6271098852157593, "eval_runtime": 2.1464, "eval_samples_per_second": 25.625, "eval_steps_per_second": 3.261, "step": 610 }, { "epoch": 55.58181818181818, "grad_norm": 0.27817094326019287, "learning_rate": 4.389e-05, "loss": 0.5705, "step": 611 }, { "epoch": 55.67878787878788, "grad_norm": 0.25310567021369934, "learning_rate": 4.388000000000001e-05, "loss": 0.6376, "step": 612 }, { "epoch": 55.775757575757574, "grad_norm": 0.253987580537796, "learning_rate": 4.387e-05, "loss": 0.581, "step": 613 }, { "epoch": 55.872727272727275, "grad_norm": 0.2909943759441376, "learning_rate": 4.3860000000000004e-05, "loss": 0.581, "step": 614 }, { "epoch": 55.96969696969697, "grad_norm": 0.3695921301841736, "learning_rate": 4.385e-05, "loss": 0.6361, "step": 615 }, { "epoch": 56.0, "grad_norm": 0.4382748305797577, "learning_rate": 4.384e-05, "loss": 0.5501, "step": 616 }, { "epoch": 56.096969696969694, "grad_norm": 0.29598528146743774, "learning_rate": 4.3830000000000006e-05, "loss": 0.6328, "step": 617 }, { "epoch": 56.193939393939395, "grad_norm": 0.3592718541622162, "learning_rate": 4.382e-05, "loss": 0.5913, "step": 618 }, { "epoch": 56.29090909090909, "grad_norm": 0.30886024236679077, "learning_rate": 4.381e-05, "loss": 0.6639, "step": 619 }, { "epoch": 56.38787878787879, "grad_norm": 0.32863330841064453, "learning_rate": 4.38e-05, "loss": 0.5666, "step": 620 }, { "epoch": 56.38787878787879, "eval_loss": 0.6228953003883362, "eval_runtime": 2.1556, "eval_samples_per_second": 25.515, "eval_steps_per_second": 3.247, "step": 620 }, { "epoch": 56.484848484848484, "grad_norm": 0.3019230365753174, "learning_rate": 4.3790000000000006e-05, "loss": 0.5589, "step": 621 }, { "epoch": 56.58181818181818, "grad_norm": 0.30603864789009094, "learning_rate": 4.3780000000000004e-05, "loss": 0.5846, "step": 622 }, { "epoch": 56.67878787878788, "grad_norm": 0.3238994777202606, "learning_rate": 4.377e-05, "loss": 0.5891, "step": 623 }, { "epoch": 56.775757575757574, "grad_norm": 0.2910254895687103, "learning_rate": 4.376e-05, "loss": 0.561, "step": 624 }, { "epoch": 56.872727272727275, "grad_norm": 0.28653401136398315, "learning_rate": 4.375e-05, "loss": 0.6044, "step": 625 }, { "epoch": 56.96969696969697, "grad_norm": 0.28131261467933655, "learning_rate": 4.3740000000000005e-05, "loss": 0.6116, "step": 626 }, { "epoch": 57.0, "grad_norm": 0.4516832232475281, "learning_rate": 4.373e-05, "loss": 0.6444, "step": 627 }, { "epoch": 57.096969696969694, "grad_norm": 0.24059724807739258, "learning_rate": 4.372e-05, "loss": 0.608, "step": 628 }, { "epoch": 57.193939393939395, "grad_norm": 0.30921855568885803, "learning_rate": 4.371e-05, "loss": 0.559, "step": 629 }, { "epoch": 57.29090909090909, "grad_norm": 0.3092626929283142, "learning_rate": 4.3700000000000005e-05, "loss": 0.6223, "step": 630 }, { "epoch": 57.29090909090909, "eval_loss": 0.6180606484413147, "eval_runtime": 2.1784, "eval_samples_per_second": 25.248, "eval_steps_per_second": 3.213, "step": 630 }, { "epoch": 57.38787878787879, "grad_norm": 0.2981205880641937, "learning_rate": 4.3690000000000004e-05, "loss": 0.5294, "step": 631 }, { "epoch": 57.484848484848484, "grad_norm": 0.2812468707561493, "learning_rate": 4.368e-05, "loss": 0.6457, "step": 632 }, { "epoch": 57.58181818181818, "grad_norm": 0.24225398898124695, "learning_rate": 4.367e-05, "loss": 0.6397, "step": 633 }, { "epoch": 57.67878787878788, "grad_norm": 0.2751910388469696, "learning_rate": 4.366e-05, "loss": 0.5841, "step": 634 }, { "epoch": 57.775757575757574, "grad_norm": 0.2612401843070984, "learning_rate": 4.3650000000000004e-05, "loss": 0.5602, "step": 635 }, { "epoch": 57.872727272727275, "grad_norm": 0.27459433674812317, "learning_rate": 4.364e-05, "loss": 0.6028, "step": 636 }, { "epoch": 57.96969696969697, "grad_norm": 0.27292782068252563, "learning_rate": 4.363000000000001e-05, "loss": 0.5603, "step": 637 }, { "epoch": 58.0, "grad_norm": 0.3346533179283142, "learning_rate": 4.362e-05, "loss": 0.6219, "step": 638 }, { "epoch": 58.096969696969694, "grad_norm": 0.3203403949737549, "learning_rate": 4.361e-05, "loss": 0.6183, "step": 639 }, { "epoch": 58.193939393939395, "grad_norm": 0.27702099084854126, "learning_rate": 4.36e-05, "loss": 0.5564, "step": 640 }, { "epoch": 58.193939393939395, "eval_loss": 0.6141034960746765, "eval_runtime": 2.1576, "eval_samples_per_second": 25.491, "eval_steps_per_second": 3.244, "step": 640 }, { "epoch": 58.29090909090909, "grad_norm": 0.3335529863834381, "learning_rate": 4.359e-05, "loss": 0.6158, "step": 641 }, { "epoch": 58.38787878787879, "grad_norm": 0.2782766819000244, "learning_rate": 4.3580000000000006e-05, "loss": 0.5614, "step": 642 }, { "epoch": 58.484848484848484, "grad_norm": 0.2564246356487274, "learning_rate": 4.357e-05, "loss": 0.5689, "step": 643 }, { "epoch": 58.58181818181818, "grad_norm": 0.28409498929977417, "learning_rate": 4.356e-05, "loss": 0.5503, "step": 644 }, { "epoch": 58.67878787878788, "grad_norm": 0.28188493847846985, "learning_rate": 4.355e-05, "loss": 0.5662, "step": 645 }, { "epoch": 58.775757575757574, "grad_norm": 0.36505699157714844, "learning_rate": 4.354e-05, "loss": 0.6206, "step": 646 }, { "epoch": 58.872727272727275, "grad_norm": 0.2862570881843567, "learning_rate": 4.3530000000000005e-05, "loss": 0.6148, "step": 647 }, { "epoch": 58.96969696969697, "grad_norm": 0.26443371176719666, "learning_rate": 4.352e-05, "loss": 0.5892, "step": 648 }, { "epoch": 59.0, "grad_norm": 0.4998515844345093, "learning_rate": 4.351e-05, "loss": 0.6026, "step": 649 }, { "epoch": 59.096969696969694, "grad_norm": 0.28331199288368225, "learning_rate": 4.35e-05, "loss": 0.5927, "step": 650 }, { "epoch": 59.096969696969694, "eval_loss": 0.6094768643379211, "eval_runtime": 2.1683, "eval_samples_per_second": 25.366, "eval_steps_per_second": 3.228, "step": 650 }, { "epoch": 59.193939393939395, "grad_norm": 0.34278160333633423, "learning_rate": 4.3490000000000005e-05, "loss": 0.5871, "step": 651 }, { "epoch": 59.29090909090909, "grad_norm": 0.22800803184509277, "learning_rate": 4.3480000000000004e-05, "loss": 0.5934, "step": 652 }, { "epoch": 59.38787878787879, "grad_norm": 0.2810174822807312, "learning_rate": 4.347e-05, "loss": 0.5211, "step": 653 }, { "epoch": 59.484848484848484, "grad_norm": 0.31853628158569336, "learning_rate": 4.346e-05, "loss": 0.6027, "step": 654 }, { "epoch": 59.58181818181818, "grad_norm": 0.31586185097694397, "learning_rate": 4.345e-05, "loss": 0.6238, "step": 655 }, { "epoch": 59.67878787878788, "grad_norm": 0.29778969287872314, "learning_rate": 4.3440000000000004e-05, "loss": 0.5836, "step": 656 }, { "epoch": 59.775757575757574, "grad_norm": 0.25551241636276245, "learning_rate": 4.343e-05, "loss": 0.5566, "step": 657 }, { "epoch": 59.872727272727275, "grad_norm": 0.2709314227104187, "learning_rate": 4.342e-05, "loss": 0.5779, "step": 658 }, { "epoch": 59.96969696969697, "grad_norm": 0.24351809918880463, "learning_rate": 4.341e-05, "loss": 0.5947, "step": 659 }, { "epoch": 60.0, "grad_norm": 0.4438367486000061, "learning_rate": 4.3400000000000005e-05, "loss": 0.534, "step": 660 }, { "epoch": 60.0, "eval_loss": 0.605474591255188, "eval_runtime": 2.1507, "eval_samples_per_second": 25.574, "eval_steps_per_second": 3.255, "step": 660 }, { "epoch": 60.096969696969694, "grad_norm": 0.23856693506240845, "learning_rate": 4.339e-05, "loss": 0.5635, "step": 661 }, { "epoch": 60.193939393939395, "grad_norm": 0.32172146439552307, "learning_rate": 4.338e-05, "loss": 0.6082, "step": 662 }, { "epoch": 60.29090909090909, "grad_norm": 0.26954877376556396, "learning_rate": 4.337e-05, "loss": 0.5741, "step": 663 }, { "epoch": 60.38787878787879, "grad_norm": 0.37685203552246094, "learning_rate": 4.336e-05, "loss": 0.5493, "step": 664 }, { "epoch": 60.484848484848484, "grad_norm": 0.2673247456550598, "learning_rate": 4.335e-05, "loss": 0.6096, "step": 665 }, { "epoch": 60.58181818181818, "grad_norm": 0.2932179868221283, "learning_rate": 4.334e-05, "loss": 0.5901, "step": 666 }, { "epoch": 60.67878787878788, "grad_norm": 0.2641594409942627, "learning_rate": 4.333000000000001e-05, "loss": 0.5539, "step": 667 }, { "epoch": 60.775757575757574, "grad_norm": 0.26737964153289795, "learning_rate": 4.332e-05, "loss": 0.5452, "step": 668 }, { "epoch": 60.872727272727275, "grad_norm": 0.3418692946434021, "learning_rate": 4.3310000000000004e-05, "loss": 0.632, "step": 669 }, { "epoch": 60.96969696969697, "grad_norm": 0.37048324942588806, "learning_rate": 4.33e-05, "loss": 0.5266, "step": 670 }, { "epoch": 60.96969696969697, "eval_loss": 0.6022237539291382, "eval_runtime": 2.1773, "eval_samples_per_second": 25.261, "eval_steps_per_second": 3.215, "step": 670 }, { "epoch": 61.0, "grad_norm": 0.4496045410633087, "learning_rate": 4.329e-05, "loss": 0.6288, "step": 671 }, { "epoch": 61.096969696969694, "grad_norm": 0.2302197962999344, "learning_rate": 4.3280000000000006e-05, "loss": 0.5253, "step": 672 }, { "epoch": 61.193939393939395, "grad_norm": 0.2643318474292755, "learning_rate": 4.327e-05, "loss": 0.5668, "step": 673 }, { "epoch": 61.29090909090909, "grad_norm": 0.23105840384960175, "learning_rate": 4.326e-05, "loss": 0.6022, "step": 674 }, { "epoch": 61.38787878787879, "grad_norm": 0.2893432080745697, "learning_rate": 4.325e-05, "loss": 0.5638, "step": 675 }, { "epoch": 61.484848484848484, "grad_norm": 0.37326863408088684, "learning_rate": 4.324e-05, "loss": 0.5725, "step": 676 }, { "epoch": 61.58181818181818, "grad_norm": 0.26676344871520996, "learning_rate": 4.3230000000000005e-05, "loss": 0.6257, "step": 677 }, { "epoch": 61.67878787878788, "grad_norm": 0.25675010681152344, "learning_rate": 4.3219999999999996e-05, "loss": 0.5766, "step": 678 }, { "epoch": 61.775757575757574, "grad_norm": 0.228947252035141, "learning_rate": 4.321e-05, "loss": 0.5876, "step": 679 }, { "epoch": 61.872727272727275, "grad_norm": 0.22656583786010742, "learning_rate": 4.32e-05, "loss": 0.5509, "step": 680 }, { "epoch": 61.872727272727275, "eval_loss": 0.5982839465141296, "eval_runtime": 2.1524, "eval_samples_per_second": 25.552, "eval_steps_per_second": 3.252, "step": 680 }, { "epoch": 61.96969696969697, "grad_norm": 0.2663673460483551, "learning_rate": 4.3190000000000005e-05, "loss": 0.5563, "step": 681 }, { "epoch": 62.0, "grad_norm": 0.35887378454208374, "learning_rate": 4.318e-05, "loss": 0.5491, "step": 682 }, { "epoch": 62.096969696969694, "grad_norm": 0.2571672797203064, "learning_rate": 4.317e-05, "loss": 0.5039, "step": 683 }, { "epoch": 62.193939393939395, "grad_norm": 0.2171117067337036, "learning_rate": 4.316e-05, "loss": 0.5455, "step": 684 }, { "epoch": 62.29090909090909, "grad_norm": 0.2166251391172409, "learning_rate": 4.315e-05, "loss": 0.5581, "step": 685 }, { "epoch": 62.38787878787879, "grad_norm": 0.2635202407836914, "learning_rate": 4.3140000000000004e-05, "loss": 0.5444, "step": 686 }, { "epoch": 62.484848484848484, "grad_norm": 0.28910040855407715, "learning_rate": 4.313e-05, "loss": 0.5821, "step": 687 }, { "epoch": 62.58181818181818, "grad_norm": 0.30902335047721863, "learning_rate": 4.312000000000001e-05, "loss": 0.6414, "step": 688 }, { "epoch": 62.67878787878788, "grad_norm": 0.29418280720710754, "learning_rate": 4.311e-05, "loss": 0.6108, "step": 689 }, { "epoch": 62.775757575757574, "grad_norm": 0.3029976785182953, "learning_rate": 4.3100000000000004e-05, "loss": 0.5546, "step": 690 }, { "epoch": 62.775757575757574, "eval_loss": 0.5950291752815247, "eval_runtime": 2.1501, "eval_samples_per_second": 25.58, "eval_steps_per_second": 3.256, "step": 690 }, { "epoch": 62.872727272727275, "grad_norm": 0.31214746832847595, "learning_rate": 4.309e-05, "loss": 0.5521, "step": 691 }, { "epoch": 62.96969696969697, "grad_norm": 0.2938581705093384, "learning_rate": 4.308e-05, "loss": 0.5875, "step": 692 }, { "epoch": 63.0, "grad_norm": 0.34210625290870667, "learning_rate": 4.3070000000000006e-05, "loss": 0.5625, "step": 693 }, { "epoch": 63.096969696969694, "grad_norm": 0.26499512791633606, "learning_rate": 4.306e-05, "loss": 0.5297, "step": 694 }, { "epoch": 63.193939393939395, "grad_norm": 0.24259193241596222, "learning_rate": 4.305e-05, "loss": 0.5309, "step": 695 }, { "epoch": 63.29090909090909, "grad_norm": 0.24082423746585846, "learning_rate": 4.304e-05, "loss": 0.5689, "step": 696 }, { "epoch": 63.38787878787879, "grad_norm": 0.2728228271007538, "learning_rate": 4.3030000000000006e-05, "loss": 0.5717, "step": 697 }, { "epoch": 63.484848484848484, "grad_norm": 0.2863039970397949, "learning_rate": 4.3020000000000005e-05, "loss": 0.5788, "step": 698 }, { "epoch": 63.58181818181818, "grad_norm": 0.24509799480438232, "learning_rate": 4.301e-05, "loss": 0.6085, "step": 699 }, { "epoch": 63.67878787878788, "grad_norm": 0.2451586127281189, "learning_rate": 4.3e-05, "loss": 0.551, "step": 700 }, { "epoch": 63.67878787878788, "eval_loss": 0.5922297835350037, "eval_runtime": 2.1427, "eval_samples_per_second": 25.669, "eval_steps_per_second": 3.267, "step": 700 }, { "epoch": 63.775757575757574, "grad_norm": 0.2422385960817337, "learning_rate": 4.299e-05, "loss": 0.5275, "step": 701 }, { "epoch": 63.872727272727275, "grad_norm": 0.34540262818336487, "learning_rate": 4.2980000000000005e-05, "loss": 0.5706, "step": 702 }, { "epoch": 63.96969696969697, "grad_norm": 0.19919060170650482, "learning_rate": 4.2970000000000004e-05, "loss": 0.5926, "step": 703 }, { "epoch": 64.0, "grad_norm": 0.3330257833003998, "learning_rate": 4.296e-05, "loss": 0.601, "step": 704 }, { "epoch": 64.0969696969697, "grad_norm": 0.25846561789512634, "learning_rate": 4.295e-05, "loss": 0.5277, "step": 705 }, { "epoch": 64.19393939393939, "grad_norm": 0.31724169850349426, "learning_rate": 4.2940000000000006e-05, "loss": 0.5923, "step": 706 }, { "epoch": 64.2909090909091, "grad_norm": 0.23587939143180847, "learning_rate": 4.2930000000000004e-05, "loss": 0.5633, "step": 707 }, { "epoch": 64.38787878787879, "grad_norm": 0.2762688398361206, "learning_rate": 4.292e-05, "loss": 0.5784, "step": 708 }, { "epoch": 64.48484848484848, "grad_norm": 0.23367071151733398, "learning_rate": 4.291e-05, "loss": 0.6072, "step": 709 }, { "epoch": 64.58181818181818, "grad_norm": 0.2732716500759125, "learning_rate": 4.29e-05, "loss": 0.533, "step": 710 }, { "epoch": 64.58181818181818, "eval_loss": 0.5896597504615784, "eval_runtime": 2.1619, "eval_samples_per_second": 25.441, "eval_steps_per_second": 3.238, "step": 710 }, { "epoch": 64.67878787878787, "grad_norm": 0.22921384871006012, "learning_rate": 4.2890000000000004e-05, "loss": 0.5534, "step": 711 }, { "epoch": 64.77575757575758, "grad_norm": 0.27668488025665283, "learning_rate": 4.288e-05, "loss": 0.5378, "step": 712 }, { "epoch": 64.87272727272727, "grad_norm": 0.2813219428062439, "learning_rate": 4.287000000000001e-05, "loss": 0.5711, "step": 713 }, { "epoch": 64.96969696969697, "grad_norm": 0.25303152203559875, "learning_rate": 4.286e-05, "loss": 0.5697, "step": 714 }, { "epoch": 65.0, "grad_norm": 0.47778254747390747, "learning_rate": 4.285e-05, "loss": 0.4676, "step": 715 }, { "epoch": 65.0969696969697, "grad_norm": 0.24541884660720825, "learning_rate": 4.284e-05, "loss": 0.6171, "step": 716 }, { "epoch": 65.19393939393939, "grad_norm": 0.2891412079334259, "learning_rate": 4.283e-05, "loss": 0.5985, "step": 717 }, { "epoch": 65.2909090909091, "grad_norm": 0.2899450957775116, "learning_rate": 4.282000000000001e-05, "loss": 0.5589, "step": 718 }, { "epoch": 65.38787878787879, "grad_norm": 0.2739652395248413, "learning_rate": 4.281e-05, "loss": 0.5468, "step": 719 }, { "epoch": 65.48484848484848, "grad_norm": 0.24000035226345062, "learning_rate": 4.2800000000000004e-05, "loss": 0.5106, "step": 720 }, { "epoch": 65.48484848484848, "eval_loss": 0.5866174697875977, "eval_runtime": 2.1673, "eval_samples_per_second": 25.377, "eval_steps_per_second": 3.23, "step": 720 }, { "epoch": 65.58181818181818, "grad_norm": 0.2736685872077942, "learning_rate": 4.279e-05, "loss": 0.5149, "step": 721 }, { "epoch": 65.67878787878787, "grad_norm": 0.3003578782081604, "learning_rate": 4.278e-05, "loss": 0.567, "step": 722 }, { "epoch": 65.77575757575758, "grad_norm": 0.2917529046535492, "learning_rate": 4.2770000000000006e-05, "loss": 0.571, "step": 723 }, { "epoch": 65.87272727272727, "grad_norm": 0.20763424038887024, "learning_rate": 4.276e-05, "loss": 0.5544, "step": 724 }, { "epoch": 65.96969696969697, "grad_norm": 0.214829683303833, "learning_rate": 4.275e-05, "loss": 0.5496, "step": 725 }, { "epoch": 66.0, "grad_norm": 0.39429301023483276, "learning_rate": 4.274e-05, "loss": 0.496, "step": 726 }, { "epoch": 66.0969696969697, "grad_norm": 0.24492646753787994, "learning_rate": 4.2730000000000006e-05, "loss": 0.5959, "step": 727 }, { "epoch": 66.19393939393939, "grad_norm": 0.27110999822616577, "learning_rate": 4.2720000000000004e-05, "loss": 0.5544, "step": 728 }, { "epoch": 66.2909090909091, "grad_norm": 0.21391001343727112, "learning_rate": 4.271e-05, "loss": 0.4939, "step": 729 }, { "epoch": 66.38787878787879, "grad_norm": 0.2673359215259552, "learning_rate": 4.27e-05, "loss": 0.619, "step": 730 }, { "epoch": 66.38787878787879, "eval_loss": 0.5843937993049622, "eval_runtime": 2.1541, "eval_samples_per_second": 25.533, "eval_steps_per_second": 3.25, "step": 730 }, { "epoch": 66.48484848484848, "grad_norm": 0.34252795577049255, "learning_rate": 4.269e-05, "loss": 0.5253, "step": 731 }, { "epoch": 66.58181818181818, "grad_norm": 0.21049198508262634, "learning_rate": 4.2680000000000005e-05, "loss": 0.5423, "step": 732 }, { "epoch": 66.67878787878787, "grad_norm": 0.213708758354187, "learning_rate": 4.267e-05, "loss": 0.5479, "step": 733 }, { "epoch": 66.77575757575758, "grad_norm": 0.2658591568470001, "learning_rate": 4.266e-05, "loss": 0.5763, "step": 734 }, { "epoch": 66.87272727272727, "grad_norm": 0.2451457381248474, "learning_rate": 4.265e-05, "loss": 0.5478, "step": 735 }, { "epoch": 66.96969696969697, "grad_norm": 0.28571853041648865, "learning_rate": 4.2640000000000005e-05, "loss": 0.5563, "step": 736 }, { "epoch": 67.0, "grad_norm": 0.4764930009841919, "learning_rate": 4.2630000000000004e-05, "loss": 0.4799, "step": 737 }, { "epoch": 67.0969696969697, "grad_norm": 0.21275725960731506, "learning_rate": 4.262e-05, "loss": 0.563, "step": 738 }, { "epoch": 67.19393939393939, "grad_norm": 0.24314363300800323, "learning_rate": 4.261e-05, "loss": 0.5669, "step": 739 }, { "epoch": 67.2909090909091, "grad_norm": 0.23035164177417755, "learning_rate": 4.26e-05, "loss": 0.5501, "step": 740 }, { "epoch": 67.2909090909091, "eval_loss": 0.5817498564720154, "eval_runtime": 2.1583, "eval_samples_per_second": 25.483, "eval_steps_per_second": 3.243, "step": 740 }, { "epoch": 67.38787878787879, "grad_norm": 0.2700410485267639, "learning_rate": 4.2590000000000004e-05, "loss": 0.5613, "step": 741 }, { "epoch": 67.48484848484848, "grad_norm": 0.35602664947509766, "learning_rate": 4.258e-05, "loss": 0.5576, "step": 742 }, { "epoch": 67.58181818181818, "grad_norm": 0.26432085037231445, "learning_rate": 4.257000000000001e-05, "loss": 0.5495, "step": 743 }, { "epoch": 67.67878787878787, "grad_norm": 0.2376573383808136, "learning_rate": 4.256e-05, "loss": 0.5318, "step": 744 }, { "epoch": 67.77575757575758, "grad_norm": 0.3016679883003235, "learning_rate": 4.2550000000000004e-05, "loss": 0.5655, "step": 745 }, { "epoch": 67.87272727272727, "grad_norm": 0.2729600965976715, "learning_rate": 4.254e-05, "loss": 0.4898, "step": 746 }, { "epoch": 67.96969696969697, "grad_norm": 0.3013279139995575, "learning_rate": 4.253e-05, "loss": 0.5625, "step": 747 }, { "epoch": 68.0, "grad_norm": 0.4911724030971527, "learning_rate": 4.2520000000000006e-05, "loss": 0.5731, "step": 748 }, { "epoch": 68.0969696969697, "grad_norm": 0.23142361640930176, "learning_rate": 4.251e-05, "loss": 0.5799, "step": 749 }, { "epoch": 68.19393939393939, "grad_norm": 0.22808052599430084, "learning_rate": 4.25e-05, "loss": 0.5393, "step": 750 }, { "epoch": 68.19393939393939, "eval_loss": 0.5790581703186035, "eval_runtime": 2.17, "eval_samples_per_second": 25.346, "eval_steps_per_second": 3.226, "step": 750 }, { "epoch": 68.2909090909091, "grad_norm": 0.23808452486991882, "learning_rate": 4.249e-05, "loss": 0.5646, "step": 751 }, { "epoch": 68.38787878787879, "grad_norm": 0.2526884973049164, "learning_rate": 4.248e-05, "loss": 0.5756, "step": 752 }, { "epoch": 68.48484848484848, "grad_norm": 0.2327769547700882, "learning_rate": 4.2470000000000005e-05, "loss": 0.5109, "step": 753 }, { "epoch": 68.58181818181818, "grad_norm": 0.2696152627468109, "learning_rate": 4.246e-05, "loss": 0.5549, "step": 754 }, { "epoch": 68.67878787878787, "grad_norm": 0.33442422747612, "learning_rate": 4.245e-05, "loss": 0.5413, "step": 755 }, { "epoch": 68.77575757575758, "grad_norm": 0.23822557926177979, "learning_rate": 4.244e-05, "loss": 0.5018, "step": 756 }, { "epoch": 68.87272727272727, "grad_norm": 0.26610106229782104, "learning_rate": 4.2430000000000005e-05, "loss": 0.5691, "step": 757 }, { "epoch": 68.96969696969697, "grad_norm": 0.3304058909416199, "learning_rate": 4.2420000000000004e-05, "loss": 0.5273, "step": 758 }, { "epoch": 69.0, "grad_norm": 0.485503613948822, "learning_rate": 4.241e-05, "loss": 0.568, "step": 759 }, { "epoch": 69.0969696969697, "grad_norm": 0.20574645698070526, "learning_rate": 4.24e-05, "loss": 0.5653, "step": 760 }, { "epoch": 69.0969696969697, "eval_loss": 0.5764835476875305, "eval_runtime": 2.1761, "eval_samples_per_second": 25.275, "eval_steps_per_second": 3.217, "step": 760 }, { "epoch": 69.19393939393939, "grad_norm": 0.2699494957923889, "learning_rate": 4.239e-05, "loss": 0.516, "step": 761 }, { "epoch": 69.2909090909091, "grad_norm": 0.24947266280651093, "learning_rate": 4.2380000000000004e-05, "loss": 0.5476, "step": 762 }, { "epoch": 69.38787878787879, "grad_norm": 0.2272285521030426, "learning_rate": 4.237e-05, "loss": 0.5626, "step": 763 }, { "epoch": 69.48484848484848, "grad_norm": 0.24354349076747894, "learning_rate": 4.236e-05, "loss": 0.5629, "step": 764 }, { "epoch": 69.58181818181818, "grad_norm": 0.27001243829727173, "learning_rate": 4.235e-05, "loss": 0.5477, "step": 765 }, { "epoch": 69.67878787878787, "grad_norm": 0.23391012847423553, "learning_rate": 4.2340000000000005e-05, "loss": 0.5328, "step": 766 }, { "epoch": 69.77575757575758, "grad_norm": 0.24374918639659882, "learning_rate": 4.233e-05, "loss": 0.5185, "step": 767 }, { "epoch": 69.87272727272727, "grad_norm": 0.2527277171611786, "learning_rate": 4.232e-05, "loss": 0.5759, "step": 768 }, { "epoch": 69.96969696969697, "grad_norm": 0.26951363682746887, "learning_rate": 4.231e-05, "loss": 0.5184, "step": 769 }, { "epoch": 70.0, "grad_norm": 0.281485378742218, "learning_rate": 4.23e-05, "loss": 0.5212, "step": 770 }, { "epoch": 70.0, "eval_loss": 0.5742422342300415, "eval_runtime": 2.1603, "eval_samples_per_second": 25.46, "eval_steps_per_second": 3.24, "step": 770 }, { "epoch": 70.0969696969697, "grad_norm": 0.2688864469528198, "learning_rate": 4.229e-05, "loss": 0.5516, "step": 771 }, { "epoch": 70.19393939393939, "grad_norm": 0.2755700349807739, "learning_rate": 4.228e-05, "loss": 0.512, "step": 772 }, { "epoch": 70.2909090909091, "grad_norm": 0.2601207196712494, "learning_rate": 4.227000000000001e-05, "loss": 0.5696, "step": 773 }, { "epoch": 70.38787878787879, "grad_norm": 0.25220006704330444, "learning_rate": 4.226e-05, "loss": 0.5386, "step": 774 }, { "epoch": 70.48484848484848, "grad_norm": 0.31067171692848206, "learning_rate": 4.2250000000000004e-05, "loss": 0.5292, "step": 775 }, { "epoch": 70.58181818181818, "grad_norm": 0.30006277561187744, "learning_rate": 4.224e-05, "loss": 0.5568, "step": 776 }, { "epoch": 70.67878787878787, "grad_norm": 0.2238987237215042, "learning_rate": 4.223e-05, "loss": 0.5705, "step": 777 }, { "epoch": 70.77575757575758, "grad_norm": 0.25496816635131836, "learning_rate": 4.2220000000000006e-05, "loss": 0.5572, "step": 778 }, { "epoch": 70.87272727272727, "grad_norm": 0.21116836369037628, "learning_rate": 4.221e-05, "loss": 0.5138, "step": 779 }, { "epoch": 70.96969696969697, "grad_norm": 0.25126564502716064, "learning_rate": 4.22e-05, "loss": 0.5241, "step": 780 }, { "epoch": 70.96969696969697, "eval_loss": 0.5723573565483093, "eval_runtime": 2.1754, "eval_samples_per_second": 25.283, "eval_steps_per_second": 3.218, "step": 780 }, { "epoch": 71.0, "grad_norm": 0.36982303857803345, "learning_rate": 4.219e-05, "loss": 0.5135, "step": 781 }, { "epoch": 71.0969696969697, "grad_norm": 0.2597285211086273, "learning_rate": 4.2180000000000006e-05, "loss": 0.5315, "step": 782 }, { "epoch": 71.19393939393939, "grad_norm": 0.2919541895389557, "learning_rate": 4.2170000000000005e-05, "loss": 0.5667, "step": 783 }, { "epoch": 71.2909090909091, "grad_norm": 0.26001623272895813, "learning_rate": 4.2159999999999996e-05, "loss": 0.5545, "step": 784 }, { "epoch": 71.38787878787879, "grad_norm": 0.25698697566986084, "learning_rate": 4.215e-05, "loss": 0.5441, "step": 785 }, { "epoch": 71.48484848484848, "grad_norm": 0.2391366958618164, "learning_rate": 4.214e-05, "loss": 0.4832, "step": 786 }, { "epoch": 71.58181818181818, "grad_norm": 0.191857248544693, "learning_rate": 4.2130000000000005e-05, "loss": 0.546, "step": 787 }, { "epoch": 71.67878787878787, "grad_norm": 0.27090173959732056, "learning_rate": 4.212e-05, "loss": 0.5516, "step": 788 }, { "epoch": 71.77575757575758, "grad_norm": 0.2738029956817627, "learning_rate": 4.211e-05, "loss": 0.5072, "step": 789 }, { "epoch": 71.87272727272727, "grad_norm": 0.320273220539093, "learning_rate": 4.21e-05, "loss": 0.5563, "step": 790 }, { "epoch": 71.87272727272727, "eval_loss": 0.5696360468864441, "eval_runtime": 2.2051, "eval_samples_per_second": 24.942, "eval_steps_per_second": 3.174, "step": 790 }, { "epoch": 71.96969696969697, "grad_norm": 0.26015132665634155, "learning_rate": 4.209e-05, "loss": 0.5632, "step": 791 }, { "epoch": 72.0, "grad_norm": 0.41450807452201843, "learning_rate": 4.2080000000000004e-05, "loss": 0.4652, "step": 792 }, { "epoch": 72.0969696969697, "grad_norm": 0.24714349210262299, "learning_rate": 4.207e-05, "loss": 0.5923, "step": 793 }, { "epoch": 72.19393939393939, "grad_norm": 0.2587253451347351, "learning_rate": 4.206e-05, "loss": 0.5267, "step": 794 }, { "epoch": 72.2909090909091, "grad_norm": 0.23800162971019745, "learning_rate": 4.205e-05, "loss": 0.5081, "step": 795 }, { "epoch": 72.38787878787879, "grad_norm": 0.2461039423942566, "learning_rate": 4.2040000000000004e-05, "loss": 0.5792, "step": 796 }, { "epoch": 72.48484848484848, "grad_norm": 0.2287796139717102, "learning_rate": 4.203e-05, "loss": 0.554, "step": 797 }, { "epoch": 72.58181818181818, "grad_norm": 0.24021528661251068, "learning_rate": 4.202e-05, "loss": 0.4999, "step": 798 }, { "epoch": 72.67878787878787, "grad_norm": 0.2695189416408539, "learning_rate": 4.201e-05, "loss": 0.5849, "step": 799 }, { "epoch": 72.77575757575758, "grad_norm": 0.21474483609199524, "learning_rate": 4.2e-05, "loss": 0.5188, "step": 800 }, { "epoch": 72.77575757575758, "eval_loss": 0.567756175994873, "eval_runtime": 2.1668, "eval_samples_per_second": 25.383, "eval_steps_per_second": 3.231, "step": 800 }, { "epoch": 72.87272727272727, "grad_norm": 0.3127596378326416, "learning_rate": 4.199e-05, "loss": 0.5497, "step": 801 }, { "epoch": 72.96969696969697, "grad_norm": 0.2873506546020508, "learning_rate": 4.198e-05, "loss": 0.481, "step": 802 }, { "epoch": 73.0, "grad_norm": 0.3801302909851074, "learning_rate": 4.1970000000000006e-05, "loss": 0.4043, "step": 803 }, { "epoch": 73.0969696969697, "grad_norm": 0.3030167818069458, "learning_rate": 4.196e-05, "loss": 0.4762, "step": 804 }, { "epoch": 73.19393939393939, "grad_norm": 0.2462480664253235, "learning_rate": 4.195e-05, "loss": 0.5248, "step": 805 }, { "epoch": 73.2909090909091, "grad_norm": 0.23730331659317017, "learning_rate": 4.194e-05, "loss": 0.5396, "step": 806 }, { "epoch": 73.38787878787879, "grad_norm": 0.2629089057445526, "learning_rate": 4.193e-05, "loss": 0.5263, "step": 807 }, { "epoch": 73.48484848484848, "grad_norm": 0.27913814783096313, "learning_rate": 4.1920000000000005e-05, "loss": 0.5425, "step": 808 }, { "epoch": 73.58181818181818, "grad_norm": 0.289802610874176, "learning_rate": 4.191e-05, "loss": 0.5229, "step": 809 }, { "epoch": 73.67878787878787, "grad_norm": 0.31583303213119507, "learning_rate": 4.19e-05, "loss": 0.5544, "step": 810 }, { "epoch": 73.67878787878787, "eval_loss": 0.5661746859550476, "eval_runtime": 2.1844, "eval_samples_per_second": 25.178, "eval_steps_per_second": 3.204, "step": 810 }, { "epoch": 73.77575757575758, "grad_norm": 0.256304532289505, "learning_rate": 4.189e-05, "loss": 0.5881, "step": 811 }, { "epoch": 73.87272727272727, "grad_norm": 0.2496311515569687, "learning_rate": 4.1880000000000006e-05, "loss": 0.5106, "step": 812 }, { "epoch": 73.96969696969697, "grad_norm": 0.24245764315128326, "learning_rate": 4.1870000000000004e-05, "loss": 0.5596, "step": 813 }, { "epoch": 74.0, "grad_norm": 0.4063602089881897, "learning_rate": 4.186e-05, "loss": 0.4897, "step": 814 }, { "epoch": 74.0969696969697, "grad_norm": 0.2712928056716919, "learning_rate": 4.185e-05, "loss": 0.581, "step": 815 }, { "epoch": 74.19393939393939, "grad_norm": 0.25334519147872925, "learning_rate": 4.184e-05, "loss": 0.5512, "step": 816 }, { "epoch": 74.2909090909091, "grad_norm": 0.23247885704040527, "learning_rate": 4.1830000000000004e-05, "loss": 0.5292, "step": 817 }, { "epoch": 74.38787878787879, "grad_norm": 0.2378424108028412, "learning_rate": 4.182e-05, "loss": 0.511, "step": 818 }, { "epoch": 74.48484848484848, "grad_norm": 0.27492162585258484, "learning_rate": 4.181000000000001e-05, "loss": 0.5589, "step": 819 }, { "epoch": 74.58181818181818, "grad_norm": 0.24575957655906677, "learning_rate": 4.18e-05, "loss": 0.5062, "step": 820 }, { "epoch": 74.58181818181818, "eval_loss": 0.5639890432357788, "eval_runtime": 2.1559, "eval_samples_per_second": 25.512, "eval_steps_per_second": 3.247, "step": 820 }, { "epoch": 74.67878787878787, "grad_norm": 0.19967587292194366, "learning_rate": 4.179e-05, "loss": 0.5208, "step": 821 }, { "epoch": 74.77575757575758, "grad_norm": 0.22234216332435608, "learning_rate": 4.178e-05, "loss": 0.5334, "step": 822 }, { "epoch": 74.87272727272727, "grad_norm": 0.25841695070266724, "learning_rate": 4.177e-05, "loss": 0.5316, "step": 823 }, { "epoch": 74.96969696969697, "grad_norm": 0.21328817307949066, "learning_rate": 4.176000000000001e-05, "loss": 0.5106, "step": 824 }, { "epoch": 75.0, "grad_norm": 0.4093509316444397, "learning_rate": 4.175e-05, "loss": 0.4296, "step": 825 }, { "epoch": 75.0969696969697, "grad_norm": 0.26131540536880493, "learning_rate": 4.1740000000000004e-05, "loss": 0.5628, "step": 826 }, { "epoch": 75.19393939393939, "grad_norm": 0.28853708505630493, "learning_rate": 4.173e-05, "loss": 0.5622, "step": 827 }, { "epoch": 75.2909090909091, "grad_norm": 0.2535465657711029, "learning_rate": 4.172e-05, "loss": 0.5123, "step": 828 }, { "epoch": 75.38787878787879, "grad_norm": 0.2491854429244995, "learning_rate": 4.1710000000000006e-05, "loss": 0.5434, "step": 829 }, { "epoch": 75.48484848484848, "grad_norm": 0.24380576610565186, "learning_rate": 4.17e-05, "loss": 0.5345, "step": 830 }, { "epoch": 75.48484848484848, "eval_loss": 0.5620233416557312, "eval_runtime": 2.1695, "eval_samples_per_second": 25.352, "eval_steps_per_second": 3.227, "step": 830 }, { "epoch": 75.58181818181818, "grad_norm": 0.22149287164211273, "learning_rate": 4.169e-05, "loss": 0.5565, "step": 831 }, { "epoch": 75.67878787878787, "grad_norm": 0.2667548954486847, "learning_rate": 4.168e-05, "loss": 0.4894, "step": 832 }, { "epoch": 75.77575757575758, "grad_norm": 0.28772029280662537, "learning_rate": 4.1670000000000006e-05, "loss": 0.4504, "step": 833 }, { "epoch": 75.87272727272727, "grad_norm": 0.1839408576488495, "learning_rate": 4.1660000000000004e-05, "loss": 0.4976, "step": 834 }, { "epoch": 75.96969696969697, "grad_norm": 0.2373735010623932, "learning_rate": 4.165e-05, "loss": 0.5722, "step": 835 }, { "epoch": 76.0, "grad_norm": 0.37163248658180237, "learning_rate": 4.164e-05, "loss": 0.5103, "step": 836 }, { "epoch": 76.0969696969697, "grad_norm": 0.25069040060043335, "learning_rate": 4.163e-05, "loss": 0.5518, "step": 837 }, { "epoch": 76.19393939393939, "grad_norm": 0.2810257077217102, "learning_rate": 4.1620000000000005e-05, "loss": 0.5277, "step": 838 }, { "epoch": 76.2909090909091, "grad_norm": 0.24961601197719574, "learning_rate": 4.161e-05, "loss": 0.5557, "step": 839 }, { "epoch": 76.38787878787879, "grad_norm": 0.18921363353729248, "learning_rate": 4.16e-05, "loss": 0.5246, "step": 840 }, { "epoch": 76.38787878787879, "eval_loss": 0.5598552823066711, "eval_runtime": 2.1634, "eval_samples_per_second": 25.422, "eval_steps_per_second": 3.236, "step": 840 }, { "epoch": 76.48484848484848, "grad_norm": 0.20288659632205963, "learning_rate": 4.159e-05, "loss": 0.5274, "step": 841 }, { "epoch": 76.58181818181818, "grad_norm": 0.20671656727790833, "learning_rate": 4.1580000000000005e-05, "loss": 0.5385, "step": 842 }, { "epoch": 76.67878787878787, "grad_norm": 0.3161730468273163, "learning_rate": 4.1570000000000003e-05, "loss": 0.4516, "step": 843 }, { "epoch": 76.77575757575758, "grad_norm": 0.18657682836055756, "learning_rate": 4.156e-05, "loss": 0.5144, "step": 844 }, { "epoch": 76.87272727272727, "grad_norm": 0.22428153455257416, "learning_rate": 4.155e-05, "loss": 0.5353, "step": 845 }, { "epoch": 76.96969696969697, "grad_norm": 0.269551157951355, "learning_rate": 4.154e-05, "loss": 0.5185, "step": 846 }, { "epoch": 77.0, "grad_norm": 0.4164291322231293, "learning_rate": 4.1530000000000004e-05, "loss": 0.5249, "step": 847 }, { "epoch": 77.0969696969697, "grad_norm": 0.24406081438064575, "learning_rate": 4.152e-05, "loss": 0.5158, "step": 848 }, { "epoch": 77.19393939393939, "grad_norm": 0.25519075989723206, "learning_rate": 4.151000000000001e-05, "loss": 0.5357, "step": 849 }, { "epoch": 77.2909090909091, "grad_norm": 0.20665471255779266, "learning_rate": 4.15e-05, "loss": 0.5698, "step": 850 }, { "epoch": 77.2909090909091, "eval_loss": 0.5581657886505127, "eval_runtime": 2.1723, "eval_samples_per_second": 25.318, "eval_steps_per_second": 3.222, "step": 850 }, { "epoch": 77.38787878787879, "grad_norm": 0.2132198065519333, "learning_rate": 4.1490000000000004e-05, "loss": 0.5195, "step": 851 }, { "epoch": 77.48484848484848, "grad_norm": 0.32972055673599243, "learning_rate": 4.148e-05, "loss": 0.459, "step": 852 }, { "epoch": 77.58181818181818, "grad_norm": 0.23783457279205322, "learning_rate": 4.147e-05, "loss": 0.5747, "step": 853 }, { "epoch": 77.67878787878787, "grad_norm": 0.2641775906085968, "learning_rate": 4.1460000000000006e-05, "loss": 0.5261, "step": 854 }, { "epoch": 77.77575757575758, "grad_norm": 0.22427310049533844, "learning_rate": 4.145e-05, "loss": 0.5283, "step": 855 }, { "epoch": 77.87272727272727, "grad_norm": 0.2677212059497833, "learning_rate": 4.144e-05, "loss": 0.5173, "step": 856 }, { "epoch": 77.96969696969697, "grad_norm": 0.23267583549022675, "learning_rate": 4.143e-05, "loss": 0.5134, "step": 857 }, { "epoch": 78.0, "grad_norm": 0.36065569519996643, "learning_rate": 4.142000000000001e-05, "loss": 0.4203, "step": 858 }, { "epoch": 78.0969696969697, "grad_norm": 0.22041353583335876, "learning_rate": 4.1410000000000005e-05, "loss": 0.5188, "step": 859 }, { "epoch": 78.19393939393939, "grad_norm": 0.2389429658651352, "learning_rate": 4.14e-05, "loss": 0.5348, "step": 860 }, { "epoch": 78.19393939393939, "eval_loss": 0.5565226078033447, "eval_runtime": 2.1747, "eval_samples_per_second": 25.291, "eval_steps_per_second": 3.219, "step": 860 }, { "epoch": 78.2909090909091, "grad_norm": 0.22565747797489166, "learning_rate": 4.139e-05, "loss": 0.5359, "step": 861 }, { "epoch": 78.38787878787879, "grad_norm": 0.20705768465995789, "learning_rate": 4.138e-05, "loss": 0.5281, "step": 862 }, { "epoch": 78.48484848484848, "grad_norm": 0.2351296991109848, "learning_rate": 4.1370000000000005e-05, "loss": 0.4949, "step": 863 }, { "epoch": 78.58181818181818, "grad_norm": 0.2614632248878479, "learning_rate": 4.1360000000000004e-05, "loss": 0.5063, "step": 864 }, { "epoch": 78.67878787878787, "grad_norm": 0.21661315858364105, "learning_rate": 4.135e-05, "loss": 0.5488, "step": 865 }, { "epoch": 78.77575757575758, "grad_norm": 0.21293285489082336, "learning_rate": 4.134e-05, "loss": 0.4951, "step": 866 }, { "epoch": 78.87272727272727, "grad_norm": 0.20552313327789307, "learning_rate": 4.133e-05, "loss": 0.5106, "step": 867 }, { "epoch": 78.96969696969697, "grad_norm": 0.23161394894123077, "learning_rate": 4.1320000000000004e-05, "loss": 0.5271, "step": 868 }, { "epoch": 79.0, "grad_norm": 0.36716896295547485, "learning_rate": 4.131e-05, "loss": 0.5289, "step": 869 }, { "epoch": 79.0969696969697, "grad_norm": 0.2597758173942566, "learning_rate": 4.13e-05, "loss": 0.4919, "step": 870 }, { "epoch": 79.0969696969697, "eval_loss": 0.5550717711448669, "eval_runtime": 2.1786, "eval_samples_per_second": 25.246, "eval_steps_per_second": 3.213, "step": 870 }, { "epoch": 79.19393939393939, "grad_norm": 0.27578315138816833, "learning_rate": 4.129e-05, "loss": 0.4752, "step": 871 }, { "epoch": 79.2909090909091, "grad_norm": 0.27824288606643677, "learning_rate": 4.1280000000000005e-05, "loss": 0.5307, "step": 872 }, { "epoch": 79.38787878787879, "grad_norm": 0.2966783046722412, "learning_rate": 4.127e-05, "loss": 0.4939, "step": 873 }, { "epoch": 79.48484848484848, "grad_norm": 0.33290615677833557, "learning_rate": 4.126e-05, "loss": 0.5487, "step": 874 }, { "epoch": 79.58181818181818, "grad_norm": 0.23399081826210022, "learning_rate": 4.125e-05, "loss": 0.5429, "step": 875 }, { "epoch": 79.67878787878787, "grad_norm": 0.2639947831630707, "learning_rate": 4.124e-05, "loss": 0.5252, "step": 876 }, { "epoch": 79.77575757575758, "grad_norm": 0.2671773433685303, "learning_rate": 4.123e-05, "loss": 0.5471, "step": 877 }, { "epoch": 79.87272727272727, "grad_norm": 0.259935200214386, "learning_rate": 4.122e-05, "loss": 0.5292, "step": 878 }, { "epoch": 79.96969696969697, "grad_norm": 0.26900020241737366, "learning_rate": 4.121000000000001e-05, "loss": 0.5124, "step": 879 }, { "epoch": 80.0, "grad_norm": 0.33917108178138733, "learning_rate": 4.12e-05, "loss": 0.4579, "step": 880 }, { "epoch": 80.0, "eval_loss": 0.5527308583259583, "eval_runtime": 2.1784, "eval_samples_per_second": 25.248, "eval_steps_per_second": 3.213, "step": 880 }, { "epoch": 80.0969696969697, "grad_norm": 0.27525821328163147, "learning_rate": 4.1190000000000004e-05, "loss": 0.5072, "step": 881 }, { "epoch": 80.19393939393939, "grad_norm": 0.3157818615436554, "learning_rate": 4.118e-05, "loss": 0.5834, "step": 882 }, { "epoch": 80.2909090909091, "grad_norm": 0.2659405767917633, "learning_rate": 4.117e-05, "loss": 0.5384, "step": 883 }, { "epoch": 80.38787878787879, "grad_norm": 0.22492793202400208, "learning_rate": 4.1160000000000006e-05, "loss": 0.5353, "step": 884 }, { "epoch": 80.48484848484848, "grad_norm": 0.21968522667884827, "learning_rate": 4.115e-05, "loss": 0.5242, "step": 885 }, { "epoch": 80.58181818181818, "grad_norm": 0.257949560880661, "learning_rate": 4.114e-05, "loss": 0.4986, "step": 886 }, { "epoch": 80.67878787878787, "grad_norm": 0.22139108180999756, "learning_rate": 4.113e-05, "loss": 0.4976, "step": 887 }, { "epoch": 80.77575757575758, "grad_norm": 0.2856335937976837, "learning_rate": 4.1120000000000006e-05, "loss": 0.4869, "step": 888 }, { "epoch": 80.87272727272727, "grad_norm": 0.3159530460834503, "learning_rate": 4.1110000000000005e-05, "loss": 0.5073, "step": 889 }, { "epoch": 80.96969696969697, "grad_norm": 0.26196572184562683, "learning_rate": 4.11e-05, "loss": 0.4938, "step": 890 }, { "epoch": 80.96969696969697, "eval_loss": 0.5511890649795532, "eval_runtime": 2.1854, "eval_samples_per_second": 25.167, "eval_steps_per_second": 3.203, "step": 890 }, { "epoch": 81.0, "grad_norm": 0.3370470702648163, "learning_rate": 4.109e-05, "loss": 0.4691, "step": 891 }, { "epoch": 81.0969696969697, "grad_norm": 0.26037460565567017, "learning_rate": 4.108e-05, "loss": 0.5557, "step": 892 }, { "epoch": 81.19393939393939, "grad_norm": 0.2754685878753662, "learning_rate": 4.1070000000000005e-05, "loss": 0.5391, "step": 893 }, { "epoch": 81.2909090909091, "grad_norm": 0.2869008481502533, "learning_rate": 4.106e-05, "loss": 0.4685, "step": 894 }, { "epoch": 81.38787878787879, "grad_norm": 0.31535306572914124, "learning_rate": 4.105e-05, "loss": 0.5187, "step": 895 }, { "epoch": 81.48484848484848, "grad_norm": 0.3106493353843689, "learning_rate": 4.104e-05, "loss": 0.5059, "step": 896 }, { "epoch": 81.58181818181818, "grad_norm": 0.2399621158838272, "learning_rate": 4.103e-05, "loss": 0.5069, "step": 897 }, { "epoch": 81.67878787878787, "grad_norm": 0.22072452306747437, "learning_rate": 4.1020000000000004e-05, "loss": 0.51, "step": 898 }, { "epoch": 81.77575757575758, "grad_norm": 0.23221911489963531, "learning_rate": 4.101e-05, "loss": 0.5053, "step": 899 }, { "epoch": 81.87272727272727, "grad_norm": 0.2527901530265808, "learning_rate": 4.1e-05, "loss": 0.5118, "step": 900 }, { "epoch": 81.87272727272727, "eval_loss": 0.5499869585037231, "eval_runtime": 2.1811, "eval_samples_per_second": 25.217, "eval_steps_per_second": 3.209, "step": 900 }, { "epoch": 81.96969696969697, "grad_norm": 0.29236477613449097, "learning_rate": 4.099e-05, "loss": 0.5386, "step": 901 }, { "epoch": 82.0, "grad_norm": 0.36997607350349426, "learning_rate": 4.0980000000000004e-05, "loss": 0.4296, "step": 902 }, { "epoch": 82.0969696969697, "grad_norm": 0.24829061329364777, "learning_rate": 4.097e-05, "loss": 0.5552, "step": 903 }, { "epoch": 82.19393939393939, "grad_norm": 0.25277000665664673, "learning_rate": 4.096e-05, "loss": 0.5306, "step": 904 }, { "epoch": 82.2909090909091, "grad_norm": 0.24040213227272034, "learning_rate": 4.095e-05, "loss": 0.5043, "step": 905 }, { "epoch": 82.38787878787879, "grad_norm": 0.26400551199913025, "learning_rate": 4.094e-05, "loss": 0.5272, "step": 906 }, { "epoch": 82.48484848484848, "grad_norm": 0.2064426690340042, "learning_rate": 4.093e-05, "loss": 0.4846, "step": 907 }, { "epoch": 82.58181818181818, "grad_norm": 0.22036471962928772, "learning_rate": 4.092e-05, "loss": 0.494, "step": 908 }, { "epoch": 82.67878787878787, "grad_norm": 0.2294999212026596, "learning_rate": 4.0910000000000006e-05, "loss": 0.5316, "step": 909 }, { "epoch": 82.77575757575758, "grad_norm": 0.29374563694000244, "learning_rate": 4.09e-05, "loss": 0.4586, "step": 910 }, { "epoch": 82.77575757575758, "eval_loss": 0.5479086637496948, "eval_runtime": 2.1669, "eval_samples_per_second": 25.382, "eval_steps_per_second": 3.23, "step": 910 }, { "epoch": 82.87272727272727, "grad_norm": 0.20776768028736115, "learning_rate": 4.089e-05, "loss": 0.5163, "step": 911 }, { "epoch": 82.96969696969697, "grad_norm": 0.23485852777957916, "learning_rate": 4.088e-05, "loss": 0.4974, "step": 912 }, { "epoch": 83.0, "grad_norm": 0.4788278639316559, "learning_rate": 4.087e-05, "loss": 0.5461, "step": 913 }, { "epoch": 83.0969696969697, "grad_norm": 0.20169512927532196, "learning_rate": 4.0860000000000005e-05, "loss": 0.4889, "step": 914 }, { "epoch": 83.19393939393939, "grad_norm": 0.20205354690551758, "learning_rate": 4.085e-05, "loss": 0.5038, "step": 915 }, { "epoch": 83.2909090909091, "grad_norm": 0.2369161695241928, "learning_rate": 4.084e-05, "loss": 0.538, "step": 916 }, { "epoch": 83.38787878787879, "grad_norm": 0.2776532471179962, "learning_rate": 4.083e-05, "loss": 0.5042, "step": 917 }, { "epoch": 83.48484848484848, "grad_norm": 0.27464279532432556, "learning_rate": 4.0820000000000006e-05, "loss": 0.5034, "step": 918 }, { "epoch": 83.58181818181818, "grad_norm": 0.2643680274486542, "learning_rate": 4.0810000000000004e-05, "loss": 0.5154, "step": 919 }, { "epoch": 83.67878787878787, "grad_norm": 0.2726304531097412, "learning_rate": 4.08e-05, "loss": 0.4715, "step": 920 }, { "epoch": 83.67878787878787, "eval_loss": 0.5463842749595642, "eval_runtime": 2.1765, "eval_samples_per_second": 25.269, "eval_steps_per_second": 3.216, "step": 920 }, { "epoch": 83.77575757575758, "grad_norm": 0.3199925422668457, "learning_rate": 4.079e-05, "loss": 0.5715, "step": 921 }, { "epoch": 83.87272727272727, "grad_norm": 0.19012175500392914, "learning_rate": 4.078e-05, "loss": 0.5037, "step": 922 }, { "epoch": 83.96969696969697, "grad_norm": 0.2449841946363449, "learning_rate": 4.0770000000000004e-05, "loss": 0.5269, "step": 923 }, { "epoch": 84.0, "grad_norm": 0.36201179027557373, "learning_rate": 4.076e-05, "loss": 0.3858, "step": 924 }, { "epoch": 84.0969696969697, "grad_norm": 0.20386870205402374, "learning_rate": 4.075e-05, "loss": 0.477, "step": 925 }, { "epoch": 84.19393939393939, "grad_norm": 0.25486937165260315, "learning_rate": 4.074e-05, "loss": 0.4891, "step": 926 }, { "epoch": 84.2909090909091, "grad_norm": 0.2490653544664383, "learning_rate": 4.0730000000000005e-05, "loss": 0.5217, "step": 927 }, { "epoch": 84.38787878787879, "grad_norm": 0.33604300022125244, "learning_rate": 4.072e-05, "loss": 0.5314, "step": 928 }, { "epoch": 84.48484848484848, "grad_norm": 0.21965400874614716, "learning_rate": 4.071e-05, "loss": 0.5255, "step": 929 }, { "epoch": 84.58181818181818, "grad_norm": 0.23073619604110718, "learning_rate": 4.07e-05, "loss": 0.456, "step": 930 }, { "epoch": 84.58181818181818, "eval_loss": 0.5450170636177063, "eval_runtime": 2.1803, "eval_samples_per_second": 25.226, "eval_steps_per_second": 3.211, "step": 930 }, { "epoch": 84.67878787878787, "grad_norm": 0.27645522356033325, "learning_rate": 4.069e-05, "loss": 0.4772, "step": 931 }, { "epoch": 84.77575757575758, "grad_norm": 0.27022767066955566, "learning_rate": 4.0680000000000004e-05, "loss": 0.5755, "step": 932 }, { "epoch": 84.87272727272727, "grad_norm": 0.24547351896762848, "learning_rate": 4.067e-05, "loss": 0.4969, "step": 933 }, { "epoch": 84.96969696969697, "grad_norm": 0.2812168598175049, "learning_rate": 4.066e-05, "loss": 0.5265, "step": 934 }, { "epoch": 85.0, "grad_norm": 0.36018022894859314, "learning_rate": 4.065e-05, "loss": 0.4702, "step": 935 }, { "epoch": 85.0969696969697, "grad_norm": 0.2520100176334381, "learning_rate": 4.064e-05, "loss": 0.4896, "step": 936 }, { "epoch": 85.19393939393939, "grad_norm": 0.3157733082771301, "learning_rate": 4.063e-05, "loss": 0.5089, "step": 937 }, { "epoch": 85.2909090909091, "grad_norm": 0.2828407883644104, "learning_rate": 4.062e-05, "loss": 0.4741, "step": 938 }, { "epoch": 85.38787878787879, "grad_norm": 0.29748284816741943, "learning_rate": 4.0610000000000006e-05, "loss": 0.5395, "step": 939 }, { "epoch": 85.48484848484848, "grad_norm": 0.2942025363445282, "learning_rate": 4.0600000000000004e-05, "loss": 0.5289, "step": 940 }, { "epoch": 85.48484848484848, "eval_loss": 0.5434708595275879, "eval_runtime": 2.1583, "eval_samples_per_second": 25.483, "eval_steps_per_second": 3.243, "step": 940 }, { "epoch": 85.58181818181818, "grad_norm": 0.22913864254951477, "learning_rate": 4.059e-05, "loss": 0.5366, "step": 941 }, { "epoch": 85.67878787878787, "grad_norm": 0.22568920254707336, "learning_rate": 4.058e-05, "loss": 0.4572, "step": 942 }, { "epoch": 85.77575757575758, "grad_norm": 0.31184083223342896, "learning_rate": 4.057e-05, "loss": 0.5577, "step": 943 }, { "epoch": 85.87272727272727, "grad_norm": 0.2624908983707428, "learning_rate": 4.0560000000000005e-05, "loss": 0.5237, "step": 944 }, { "epoch": 85.96969696969697, "grad_norm": 0.22147274017333984, "learning_rate": 4.055e-05, "loss": 0.4643, "step": 945 }, { "epoch": 86.0, "grad_norm": 0.3841789960861206, "learning_rate": 4.054e-05, "loss": 0.3985, "step": 946 }, { "epoch": 86.0969696969697, "grad_norm": 0.24668338894844055, "learning_rate": 4.053e-05, "loss": 0.5623, "step": 947 }, { "epoch": 86.19393939393939, "grad_norm": 0.2290932983160019, "learning_rate": 4.0520000000000005e-05, "loss": 0.4791, "step": 948 }, { "epoch": 86.2909090909091, "grad_norm": 0.25622695684432983, "learning_rate": 4.0510000000000003e-05, "loss": 0.458, "step": 949 }, { "epoch": 86.38787878787879, "grad_norm": 0.27793192863464355, "learning_rate": 4.05e-05, "loss": 0.5233, "step": 950 }, { "epoch": 86.38787878787879, "eval_loss": 0.5420384407043457, "eval_runtime": 2.1543, "eval_samples_per_second": 25.53, "eval_steps_per_second": 3.249, "step": 950 }, { "epoch": 86.48484848484848, "grad_norm": 0.290298193693161, "learning_rate": 4.049e-05, "loss": 0.5017, "step": 951 }, { "epoch": 86.58181818181818, "grad_norm": 0.24654896557331085, "learning_rate": 4.048e-05, "loss": 0.4799, "step": 952 }, { "epoch": 86.67878787878787, "grad_norm": 0.34982696175575256, "learning_rate": 4.0470000000000004e-05, "loss": 0.5292, "step": 953 }, { "epoch": 86.77575757575758, "grad_norm": 0.2249320149421692, "learning_rate": 4.046e-05, "loss": 0.5497, "step": 954 }, { "epoch": 86.87272727272727, "grad_norm": 0.31159865856170654, "learning_rate": 4.045000000000001e-05, "loss": 0.4766, "step": 955 }, { "epoch": 86.96969696969697, "grad_norm": 0.3017675578594208, "learning_rate": 4.044e-05, "loss": 0.4851, "step": 956 }, { "epoch": 87.0, "grad_norm": 0.42811262607574463, "learning_rate": 4.0430000000000004e-05, "loss": 0.4453, "step": 957 }, { "epoch": 87.0969696969697, "grad_norm": 0.25056886672973633, "learning_rate": 4.042e-05, "loss": 0.4505, "step": 958 }, { "epoch": 87.19393939393939, "grad_norm": 0.23695793747901917, "learning_rate": 4.041e-05, "loss": 0.5128, "step": 959 }, { "epoch": 87.2909090909091, "grad_norm": 0.23140308260917664, "learning_rate": 4.0400000000000006e-05, "loss": 0.4859, "step": 960 }, { "epoch": 87.2909090909091, "eval_loss": 0.5406550765037537, "eval_runtime": 2.1825, "eval_samples_per_second": 25.2, "eval_steps_per_second": 3.207, "step": 960 }, { "epoch": 87.38787878787879, "grad_norm": 0.2655631899833679, "learning_rate": 4.039e-05, "loss": 0.4867, "step": 961 }, { "epoch": 87.48484848484848, "grad_norm": 0.41470491886138916, "learning_rate": 4.038e-05, "loss": 0.528, "step": 962 }, { "epoch": 87.58181818181818, "grad_norm": 0.29415225982666016, "learning_rate": 4.037e-05, "loss": 0.4967, "step": 963 }, { "epoch": 87.67878787878787, "grad_norm": 0.2565857470035553, "learning_rate": 4.0360000000000007e-05, "loss": 0.4967, "step": 964 }, { "epoch": 87.77575757575758, "grad_norm": 0.23314039409160614, "learning_rate": 4.0350000000000005e-05, "loss": 0.5357, "step": 965 }, { "epoch": 87.87272727272727, "grad_norm": 0.2893287241458893, "learning_rate": 4.034e-05, "loss": 0.4635, "step": 966 }, { "epoch": 87.96969696969697, "grad_norm": 0.2738717198371887, "learning_rate": 4.033e-05, "loss": 0.5388, "step": 967 }, { "epoch": 88.0, "grad_norm": 0.3843998610973358, "learning_rate": 4.032e-05, "loss": 0.5291, "step": 968 }, { "epoch": 88.0969696969697, "grad_norm": 0.2288815975189209, "learning_rate": 4.0310000000000005e-05, "loss": 0.5184, "step": 969 }, { "epoch": 88.19393939393939, "grad_norm": 0.2845372259616852, "learning_rate": 4.0300000000000004e-05, "loss": 0.5061, "step": 970 }, { "epoch": 88.19393939393939, "eval_loss": 0.5385352373123169, "eval_runtime": 2.175, "eval_samples_per_second": 25.287, "eval_steps_per_second": 3.218, "step": 970 }, { "epoch": 88.2909090909091, "grad_norm": 0.24731485545635223, "learning_rate": 4.029e-05, "loss": 0.5337, "step": 971 }, { "epoch": 88.38787878787879, "grad_norm": 0.29482629895210266, "learning_rate": 4.028e-05, "loss": 0.4525, "step": 972 }, { "epoch": 88.48484848484848, "grad_norm": 0.30363136529922485, "learning_rate": 4.027e-05, "loss": 0.5321, "step": 973 }, { "epoch": 88.58181818181818, "grad_norm": 0.2996557056903839, "learning_rate": 4.0260000000000004e-05, "loss": 0.4991, "step": 974 }, { "epoch": 88.67878787878787, "grad_norm": 0.2432321459054947, "learning_rate": 4.025e-05, "loss": 0.5032, "step": 975 }, { "epoch": 88.77575757575758, "grad_norm": 0.24066884815692902, "learning_rate": 4.024e-05, "loss": 0.5177, "step": 976 }, { "epoch": 88.87272727272727, "grad_norm": 0.2924744188785553, "learning_rate": 4.023e-05, "loss": 0.4963, "step": 977 }, { "epoch": 88.96969696969697, "grad_norm": 0.26147395372390747, "learning_rate": 4.0220000000000005e-05, "loss": 0.445, "step": 978 }, { "epoch": 89.0, "grad_norm": 0.4114329218864441, "learning_rate": 4.021e-05, "loss": 0.4469, "step": 979 }, { "epoch": 89.0969696969697, "grad_norm": 0.2708848714828491, "learning_rate": 4.02e-05, "loss": 0.5171, "step": 980 }, { "epoch": 89.0969696969697, "eval_loss": 0.5373245477676392, "eval_runtime": 2.1668, "eval_samples_per_second": 25.383, "eval_steps_per_second": 3.231, "step": 980 }, { "epoch": 89.19393939393939, "grad_norm": 0.24809399247169495, "learning_rate": 4.019e-05, "loss": 0.4804, "step": 981 }, { "epoch": 89.2909090909091, "grad_norm": 0.244582399725914, "learning_rate": 4.018e-05, "loss": 0.4539, "step": 982 }, { "epoch": 89.38787878787879, "grad_norm": 0.238327294588089, "learning_rate": 4.017e-05, "loss": 0.4755, "step": 983 }, { "epoch": 89.48484848484848, "grad_norm": 0.27144134044647217, "learning_rate": 4.016e-05, "loss": 0.4962, "step": 984 }, { "epoch": 89.58181818181818, "grad_norm": 0.27538859844207764, "learning_rate": 4.015000000000001e-05, "loss": 0.4898, "step": 985 }, { "epoch": 89.67878787878787, "grad_norm": 0.23520852625370026, "learning_rate": 4.014e-05, "loss": 0.5117, "step": 986 }, { "epoch": 89.77575757575758, "grad_norm": 0.3188869059085846, "learning_rate": 4.0130000000000004e-05, "loss": 0.4805, "step": 987 }, { "epoch": 89.87272727272727, "grad_norm": 0.25604870915412903, "learning_rate": 4.012e-05, "loss": 0.5397, "step": 988 }, { "epoch": 89.96969696969697, "grad_norm": 0.23663835227489471, "learning_rate": 4.011e-05, "loss": 0.5087, "step": 989 }, { "epoch": 90.0, "grad_norm": 0.42565521597862244, "learning_rate": 4.0100000000000006e-05, "loss": 0.5336, "step": 990 }, { "epoch": 90.0, "eval_loss": 0.5357187390327454, "eval_runtime": 2.1759, "eval_samples_per_second": 25.277, "eval_steps_per_second": 3.217, "step": 990 }, { "epoch": 90.0969696969697, "grad_norm": 0.21845504641532898, "learning_rate": 4.009e-05, "loss": 0.4928, "step": 991 }, { "epoch": 90.19393939393939, "grad_norm": 0.22380469739437103, "learning_rate": 4.008e-05, "loss": 0.4825, "step": 992 }, { "epoch": 90.2909090909091, "grad_norm": 0.19843821227550507, "learning_rate": 4.007e-05, "loss": 0.4954, "step": 993 }, { "epoch": 90.38787878787879, "grad_norm": 0.21744690835475922, "learning_rate": 4.0060000000000006e-05, "loss": 0.4988, "step": 994 }, { "epoch": 90.48484848484848, "grad_norm": 0.3063651919364929, "learning_rate": 4.0050000000000004e-05, "loss": 0.5554, "step": 995 }, { "epoch": 90.58181818181818, "grad_norm": 0.20970290899276733, "learning_rate": 4.004e-05, "loss": 0.4729, "step": 996 }, { "epoch": 90.67878787878787, "grad_norm": 0.2944852113723755, "learning_rate": 4.003e-05, "loss": 0.4766, "step": 997 }, { "epoch": 90.77575757575758, "grad_norm": 0.30164456367492676, "learning_rate": 4.002e-05, "loss": 0.5203, "step": 998 }, { "epoch": 90.87272727272727, "grad_norm": 0.2369176745414734, "learning_rate": 4.0010000000000005e-05, "loss": 0.4602, "step": 999 }, { "epoch": 90.96969696969697, "grad_norm": 0.2505776286125183, "learning_rate": 4e-05, "loss": 0.4625, "step": 1000 }, { "epoch": 90.96969696969697, "eval_loss": 0.53392094373703, "eval_runtime": 2.1907, "eval_samples_per_second": 25.107, "eval_steps_per_second": 3.195, "step": 1000 }, { "epoch": 91.0, "grad_norm": 0.4616742730140686, "learning_rate": 3.999e-05, "loss": 0.5795, "step": 1001 }, { "epoch": 91.0969696969697, "grad_norm": 0.22923487424850464, "learning_rate": 3.998e-05, "loss": 0.4707, "step": 1002 }, { "epoch": 91.19393939393939, "grad_norm": 0.2569350600242615, "learning_rate": 3.9970000000000005e-05, "loss": 0.5103, "step": 1003 }, { "epoch": 91.2909090909091, "grad_norm": 0.2466253936290741, "learning_rate": 3.9960000000000004e-05, "loss": 0.53, "step": 1004 }, { "epoch": 91.38787878787879, "grad_norm": 0.2187167853116989, "learning_rate": 3.995e-05, "loss": 0.4855, "step": 1005 }, { "epoch": 91.48484848484848, "grad_norm": 0.2226039469242096, "learning_rate": 3.994e-05, "loss": 0.4987, "step": 1006 }, { "epoch": 91.58181818181818, "grad_norm": 0.3207626938819885, "learning_rate": 3.993e-05, "loss": 0.5082, "step": 1007 }, { "epoch": 91.67878787878787, "grad_norm": 0.2705640196800232, "learning_rate": 3.9920000000000004e-05, "loss": 0.4889, "step": 1008 }, { "epoch": 91.77575757575758, "grad_norm": 0.2449875921010971, "learning_rate": 3.991e-05, "loss": 0.4749, "step": 1009 }, { "epoch": 91.87272727272727, "grad_norm": 0.21206451952457428, "learning_rate": 3.99e-05, "loss": 0.4607, "step": 1010 }, { "epoch": 91.87272727272727, "eval_loss": 0.5326685905456543, "eval_runtime": 2.188, "eval_samples_per_second": 25.137, "eval_steps_per_second": 3.199, "step": 1010 }, { "epoch": 91.96969696969697, "grad_norm": 0.26551949977874756, "learning_rate": 3.989e-05, "loss": 0.4945, "step": 1011 }, { "epoch": 92.0, "grad_norm": 0.37353646755218506, "learning_rate": 3.988e-05, "loss": 0.4931, "step": 1012 }, { "epoch": 92.0969696969697, "grad_norm": 0.21740274131298065, "learning_rate": 3.987e-05, "loss": 0.5023, "step": 1013 }, { "epoch": 92.19393939393939, "grad_norm": 0.2506203055381775, "learning_rate": 3.986e-05, "loss": 0.509, "step": 1014 }, { "epoch": 92.2909090909091, "grad_norm": 0.22760456800460815, "learning_rate": 3.9850000000000006e-05, "loss": 0.5169, "step": 1015 }, { "epoch": 92.38787878787879, "grad_norm": 0.30753859877586365, "learning_rate": 3.984e-05, "loss": 0.5013, "step": 1016 }, { "epoch": 92.48484848484848, "grad_norm": 0.24482981860637665, "learning_rate": 3.983e-05, "loss": 0.5132, "step": 1017 }, { "epoch": 92.58181818181818, "grad_norm": 0.31413623690605164, "learning_rate": 3.982e-05, "loss": 0.4936, "step": 1018 }, { "epoch": 92.67878787878787, "grad_norm": 0.22564786672592163, "learning_rate": 3.981e-05, "loss": 0.4823, "step": 1019 }, { "epoch": 92.77575757575758, "grad_norm": 0.3035551607608795, "learning_rate": 3.9800000000000005e-05, "loss": 0.4578, "step": 1020 }, { "epoch": 92.77575757575758, "eval_loss": 0.5315625071525574, "eval_runtime": 2.1756, "eval_samples_per_second": 25.281, "eval_steps_per_second": 3.218, "step": 1020 }, { "epoch": 92.87272727272727, "grad_norm": 0.2384045273065567, "learning_rate": 3.979e-05, "loss": 0.4449, "step": 1021 }, { "epoch": 92.96969696969697, "grad_norm": 0.25220656394958496, "learning_rate": 3.978e-05, "loss": 0.4847, "step": 1022 }, { "epoch": 93.0, "grad_norm": 0.2742287218570709, "learning_rate": 3.977e-05, "loss": 0.4884, "step": 1023 }, { "epoch": 93.0969696969697, "grad_norm": 0.25580084323883057, "learning_rate": 3.9760000000000006e-05, "loss": 0.4627, "step": 1024 }, { "epoch": 93.19393939393939, "grad_norm": 0.29719841480255127, "learning_rate": 3.9750000000000004e-05, "loss": 0.4855, "step": 1025 }, { "epoch": 93.2909090909091, "grad_norm": 0.3261563777923584, "learning_rate": 3.974e-05, "loss": 0.4764, "step": 1026 }, { "epoch": 93.38787878787879, "grad_norm": 0.2409381866455078, "learning_rate": 3.973e-05, "loss": 0.4672, "step": 1027 }, { "epoch": 93.48484848484848, "grad_norm": 0.26164528727531433, "learning_rate": 3.972e-05, "loss": 0.5152, "step": 1028 }, { "epoch": 93.58181818181818, "grad_norm": 0.2876894474029541, "learning_rate": 3.9710000000000004e-05, "loss": 0.521, "step": 1029 }, { "epoch": 93.67878787878787, "grad_norm": 0.22141848504543304, "learning_rate": 3.97e-05, "loss": 0.4715, "step": 1030 }, { "epoch": 93.67878787878787, "eval_loss": 0.5300446152687073, "eval_runtime": 2.1679, "eval_samples_per_second": 25.37, "eval_steps_per_second": 3.229, "step": 1030 }, { "epoch": 93.77575757575758, "grad_norm": 0.29566359519958496, "learning_rate": 3.969e-05, "loss": 0.5235, "step": 1031 }, { "epoch": 93.87272727272727, "grad_norm": 0.34124600887298584, "learning_rate": 3.968e-05, "loss": 0.4517, "step": 1032 }, { "epoch": 93.96969696969697, "grad_norm": 0.27476051449775696, "learning_rate": 3.9670000000000005e-05, "loss": 0.5173, "step": 1033 }, { "epoch": 94.0, "grad_norm": 0.38910970091819763, "learning_rate": 3.966e-05, "loss": 0.466, "step": 1034 }, { "epoch": 94.0969696969697, "grad_norm": 0.25072184205055237, "learning_rate": 3.965e-05, "loss": 0.5121, "step": 1035 }, { "epoch": 94.19393939393939, "grad_norm": 0.34341293573379517, "learning_rate": 3.964e-05, "loss": 0.435, "step": 1036 }, { "epoch": 94.2909090909091, "grad_norm": 0.2264951914548874, "learning_rate": 3.963e-05, "loss": 0.4919, "step": 1037 }, { "epoch": 94.38787878787879, "grad_norm": 0.24091920256614685, "learning_rate": 3.9620000000000004e-05, "loss": 0.4843, "step": 1038 }, { "epoch": 94.48484848484848, "grad_norm": 0.2943407893180847, "learning_rate": 3.961e-05, "loss": 0.5102, "step": 1039 }, { "epoch": 94.58181818181818, "grad_norm": 0.26254013180732727, "learning_rate": 3.960000000000001e-05, "loss": 0.4832, "step": 1040 }, { "epoch": 94.58181818181818, "eval_loss": 0.5287523865699768, "eval_runtime": 2.1727, "eval_samples_per_second": 25.314, "eval_steps_per_second": 3.222, "step": 1040 }, { "epoch": 94.67878787878787, "grad_norm": 0.3030388057231903, "learning_rate": 3.959e-05, "loss": 0.4926, "step": 1041 }, { "epoch": 94.77575757575758, "grad_norm": 0.29376158118247986, "learning_rate": 3.958e-05, "loss": 0.51, "step": 1042 }, { "epoch": 94.87272727272727, "grad_norm": 0.26268187165260315, "learning_rate": 3.957e-05, "loss": 0.4774, "step": 1043 }, { "epoch": 94.96969696969697, "grad_norm": 0.2799707055091858, "learning_rate": 3.956e-05, "loss": 0.464, "step": 1044 }, { "epoch": 95.0, "grad_norm": 0.5625638961791992, "learning_rate": 3.9550000000000006e-05, "loss": 0.5032, "step": 1045 }, { "epoch": 95.0969696969697, "grad_norm": 0.2317277193069458, "learning_rate": 3.954e-05, "loss": 0.4852, "step": 1046 }, { "epoch": 95.19393939393939, "grad_norm": 0.3576275706291199, "learning_rate": 3.953e-05, "loss": 0.4789, "step": 1047 }, { "epoch": 95.2909090909091, "grad_norm": 0.26975786685943604, "learning_rate": 3.952e-05, "loss": 0.5058, "step": 1048 }, { "epoch": 95.38787878787879, "grad_norm": 0.23840966820716858, "learning_rate": 3.951e-05, "loss": 0.541, "step": 1049 }, { "epoch": 95.48484848484848, "grad_norm": 0.30056631565093994, "learning_rate": 3.9500000000000005e-05, "loss": 0.4606, "step": 1050 }, { "epoch": 95.48484848484848, "eval_loss": 0.5273352861404419, "eval_runtime": 2.2113, "eval_samples_per_second": 24.872, "eval_steps_per_second": 3.166, "step": 1050 }, { "epoch": 95.58181818181818, "grad_norm": 0.22875840961933136, "learning_rate": 3.9489999999999996e-05, "loss": 0.4746, "step": 1051 }, { "epoch": 95.67878787878787, "grad_norm": 0.29144978523254395, "learning_rate": 3.948e-05, "loss": 0.4952, "step": 1052 }, { "epoch": 95.77575757575758, "grad_norm": 0.3198563754558563, "learning_rate": 3.947e-05, "loss": 0.4659, "step": 1053 }, { "epoch": 95.87272727272727, "grad_norm": 0.27438458800315857, "learning_rate": 3.9460000000000005e-05, "loss": 0.4623, "step": 1054 }, { "epoch": 95.96969696969697, "grad_norm": 0.2391417771577835, "learning_rate": 3.9450000000000003e-05, "loss": 0.5001, "step": 1055 }, { "epoch": 96.0, "grad_norm": 0.40089306235313416, "learning_rate": 3.944e-05, "loss": 0.415, "step": 1056 }, { "epoch": 96.0969696969697, "grad_norm": 0.26197531819343567, "learning_rate": 3.943e-05, "loss": 0.5019, "step": 1057 }, { "epoch": 96.19393939393939, "grad_norm": 0.3073684573173523, "learning_rate": 3.942e-05, "loss": 0.4635, "step": 1058 }, { "epoch": 96.2909090909091, "grad_norm": 0.2256932556629181, "learning_rate": 3.9410000000000004e-05, "loss": 0.535, "step": 1059 }, { "epoch": 96.38787878787879, "grad_norm": 0.30590319633483887, "learning_rate": 3.94e-05, "loss": 0.4781, "step": 1060 }, { "epoch": 96.38787878787879, "eval_loss": 0.5255498290061951, "eval_runtime": 2.1691, "eval_samples_per_second": 25.356, "eval_steps_per_second": 3.227, "step": 1060 }, { "epoch": 96.48484848484848, "grad_norm": 0.28140392899513245, "learning_rate": 3.939e-05, "loss": 0.4884, "step": 1061 }, { "epoch": 96.58181818181818, "grad_norm": 0.229629784822464, "learning_rate": 3.938e-05, "loss": 0.4675, "step": 1062 }, { "epoch": 96.67878787878787, "grad_norm": 0.24081185460090637, "learning_rate": 3.9370000000000004e-05, "loss": 0.4826, "step": 1063 }, { "epoch": 96.77575757575758, "grad_norm": 0.2174087017774582, "learning_rate": 3.936e-05, "loss": 0.4502, "step": 1064 }, { "epoch": 96.87272727272727, "grad_norm": 0.24293029308319092, "learning_rate": 3.935e-05, "loss": 0.4529, "step": 1065 }, { "epoch": 96.96969696969697, "grad_norm": 0.26654404401779175, "learning_rate": 3.9340000000000006e-05, "loss": 0.4891, "step": 1066 }, { "epoch": 97.0, "grad_norm": 0.4864083230495453, "learning_rate": 3.933e-05, "loss": 0.5446, "step": 1067 }, { "epoch": 97.0969696969697, "grad_norm": 0.24581874907016754, "learning_rate": 3.932e-05, "loss": 0.5138, "step": 1068 }, { "epoch": 97.19393939393939, "grad_norm": 0.24638426303863525, "learning_rate": 3.931e-05, "loss": 0.5056, "step": 1069 }, { "epoch": 97.2909090909091, "grad_norm": 0.24548788368701935, "learning_rate": 3.9300000000000007e-05, "loss": 0.5096, "step": 1070 }, { "epoch": 97.2909090909091, "eval_loss": 0.524466335773468, "eval_runtime": 2.1898, "eval_samples_per_second": 25.117, "eval_steps_per_second": 3.197, "step": 1070 }, { "epoch": 97.38787878787879, "grad_norm": 0.3515244424343109, "learning_rate": 3.9290000000000005e-05, "loss": 0.4682, "step": 1071 }, { "epoch": 97.48484848484848, "grad_norm": 0.334836483001709, "learning_rate": 3.9280000000000003e-05, "loss": 0.4996, "step": 1072 }, { "epoch": 97.58181818181818, "grad_norm": 0.3351961076259613, "learning_rate": 3.927e-05, "loss": 0.435, "step": 1073 }, { "epoch": 97.67878787878787, "grad_norm": 0.2502618134021759, "learning_rate": 3.926e-05, "loss": 0.4569, "step": 1074 }, { "epoch": 97.77575757575758, "grad_norm": 0.25060755014419556, "learning_rate": 3.9250000000000005e-05, "loss": 0.472, "step": 1075 }, { "epoch": 97.87272727272727, "grad_norm": 0.29755687713623047, "learning_rate": 3.9240000000000004e-05, "loss": 0.4862, "step": 1076 }, { "epoch": 97.96969696969697, "grad_norm": 0.44427555799484253, "learning_rate": 3.923e-05, "loss": 0.4797, "step": 1077 }, { "epoch": 98.0, "grad_norm": 0.3197348415851593, "learning_rate": 3.922e-05, "loss": 0.4355, "step": 1078 }, { "epoch": 98.0969696969697, "grad_norm": 0.2131693959236145, "learning_rate": 3.921e-05, "loss": 0.4879, "step": 1079 }, { "epoch": 98.19393939393939, "grad_norm": 0.2979012131690979, "learning_rate": 3.9200000000000004e-05, "loss": 0.4585, "step": 1080 }, { "epoch": 98.19393939393939, "eval_loss": 0.5229376554489136, "eval_runtime": 2.1836, "eval_samples_per_second": 25.187, "eval_steps_per_second": 3.206, "step": 1080 }, { "epoch": 98.2909090909091, "grad_norm": 0.3021116554737091, "learning_rate": 3.919e-05, "loss": 0.4985, "step": 1081 }, { "epoch": 98.38787878787879, "grad_norm": 0.28391754627227783, "learning_rate": 3.918e-05, "loss": 0.434, "step": 1082 }, { "epoch": 98.48484848484848, "grad_norm": 0.32781583070755005, "learning_rate": 3.917e-05, "loss": 0.4682, "step": 1083 }, { "epoch": 98.58181818181818, "grad_norm": 0.28897055983543396, "learning_rate": 3.9160000000000005e-05, "loss": 0.4744, "step": 1084 }, { "epoch": 98.67878787878787, "grad_norm": 0.33773258328437805, "learning_rate": 3.915e-05, "loss": 0.5519, "step": 1085 }, { "epoch": 98.77575757575758, "grad_norm": 0.23259370028972626, "learning_rate": 3.914e-05, "loss": 0.4688, "step": 1086 }, { "epoch": 98.87272727272727, "grad_norm": 0.27739864587783813, "learning_rate": 3.913e-05, "loss": 0.4605, "step": 1087 }, { "epoch": 98.96969696969697, "grad_norm": 0.2524653375148773, "learning_rate": 3.912e-05, "loss": 0.4761, "step": 1088 }, { "epoch": 99.0, "grad_norm": 0.5492696762084961, "learning_rate": 3.911e-05, "loss": 0.5231, "step": 1089 }, { "epoch": 99.0969696969697, "grad_norm": 0.24189777672290802, "learning_rate": 3.91e-05, "loss": 0.4844, "step": 1090 }, { "epoch": 99.0969696969697, "eval_loss": 0.5221452713012695, "eval_runtime": 2.1659, "eval_samples_per_second": 25.394, "eval_steps_per_second": 3.232, "step": 1090 }, { "epoch": 99.19393939393939, "grad_norm": 0.2546369135379791, "learning_rate": 3.909000000000001e-05, "loss": 0.5451, "step": 1091 }, { "epoch": 99.2909090909091, "grad_norm": 0.343545526266098, "learning_rate": 3.908e-05, "loss": 0.4831, "step": 1092 }, { "epoch": 99.38787878787879, "grad_norm": 0.2641466557979584, "learning_rate": 3.9070000000000004e-05, "loss": 0.4715, "step": 1093 }, { "epoch": 99.48484848484848, "grad_norm": 0.2601754665374756, "learning_rate": 3.906e-05, "loss": 0.4429, "step": 1094 }, { "epoch": 99.58181818181818, "grad_norm": 0.2309015393257141, "learning_rate": 3.905e-05, "loss": 0.4734, "step": 1095 }, { "epoch": 99.67878787878787, "grad_norm": 0.24893411993980408, "learning_rate": 3.9040000000000006e-05, "loss": 0.4558, "step": 1096 }, { "epoch": 99.77575757575758, "grad_norm": 0.2772034704685211, "learning_rate": 3.903e-05, "loss": 0.4914, "step": 1097 }, { "epoch": 99.87272727272727, "grad_norm": 0.2813556492328644, "learning_rate": 3.902e-05, "loss": 0.5145, "step": 1098 }, { "epoch": 99.96969696969697, "grad_norm": 0.29801681637763977, "learning_rate": 3.901e-05, "loss": 0.4345, "step": 1099 }, { "epoch": 100.0, "grad_norm": 0.46238136291503906, "learning_rate": 3.9000000000000006e-05, "loss": 0.4123, "step": 1100 }, { "epoch": 100.0, "eval_loss": 0.5208865404129028, "eval_runtime": 2.1746, "eval_samples_per_second": 25.292, "eval_steps_per_second": 3.219, "step": 1100 }, { "epoch": 100.0969696969697, "grad_norm": 0.2547599673271179, "learning_rate": 3.8990000000000004e-05, "loss": 0.4441, "step": 1101 }, { "epoch": 100.19393939393939, "grad_norm": 0.2973734736442566, "learning_rate": 3.898e-05, "loss": 0.5059, "step": 1102 }, { "epoch": 100.2909090909091, "grad_norm": 0.2850284278392792, "learning_rate": 3.897e-05, "loss": 0.5007, "step": 1103 }, { "epoch": 100.38787878787879, "grad_norm": 0.2912881076335907, "learning_rate": 3.896e-05, "loss": 0.4585, "step": 1104 }, { "epoch": 100.48484848484848, "grad_norm": 0.295784592628479, "learning_rate": 3.8950000000000005e-05, "loss": 0.4647, "step": 1105 }, { "epoch": 100.58181818181818, "grad_norm": 0.22379592061042786, "learning_rate": 3.894e-05, "loss": 0.4693, "step": 1106 }, { "epoch": 100.67878787878787, "grad_norm": 0.21056662499904633, "learning_rate": 3.893e-05, "loss": 0.4755, "step": 1107 }, { "epoch": 100.77575757575758, "grad_norm": 0.23073583841323853, "learning_rate": 3.892e-05, "loss": 0.4556, "step": 1108 }, { "epoch": 100.87272727272727, "grad_norm": 0.327334463596344, "learning_rate": 3.8910000000000005e-05, "loss": 0.4741, "step": 1109 }, { "epoch": 100.96969696969697, "grad_norm": 0.30131009221076965, "learning_rate": 3.8900000000000004e-05, "loss": 0.511, "step": 1110 }, { "epoch": 100.96969696969697, "eval_loss": 0.519778847694397, "eval_runtime": 2.1764, "eval_samples_per_second": 25.272, "eval_steps_per_second": 3.216, "step": 1110 }, { "epoch": 101.0, "grad_norm": 0.3921085298061371, "learning_rate": 3.889e-05, "loss": 0.4754, "step": 1111 }, { "epoch": 101.0969696969697, "grad_norm": 0.22144660353660583, "learning_rate": 3.888e-05, "loss": 0.4728, "step": 1112 }, { "epoch": 101.19393939393939, "grad_norm": 0.3088531196117401, "learning_rate": 3.887e-05, "loss": 0.5127, "step": 1113 }, { "epoch": 101.2909090909091, "grad_norm": 0.2640801668167114, "learning_rate": 3.8860000000000004e-05, "loss": 0.4583, "step": 1114 }, { "epoch": 101.38787878787879, "grad_norm": 0.2525095045566559, "learning_rate": 3.885e-05, "loss": 0.5007, "step": 1115 }, { "epoch": 101.48484848484848, "grad_norm": 0.26294276118278503, "learning_rate": 3.884e-05, "loss": 0.4766, "step": 1116 }, { "epoch": 101.58181818181818, "grad_norm": 0.21842017769813538, "learning_rate": 3.883e-05, "loss": 0.4665, "step": 1117 }, { "epoch": 101.67878787878787, "grad_norm": 0.30087506771087646, "learning_rate": 3.882e-05, "loss": 0.5296, "step": 1118 }, { "epoch": 101.77575757575758, "grad_norm": 0.2511281967163086, "learning_rate": 3.881e-05, "loss": 0.4662, "step": 1119 }, { "epoch": 101.87272727272727, "grad_norm": 0.2348358929157257, "learning_rate": 3.88e-05, "loss": 0.4281, "step": 1120 }, { "epoch": 101.87272727272727, "eval_loss": 0.5181769132614136, "eval_runtime": 2.1723, "eval_samples_per_second": 25.319, "eval_steps_per_second": 3.222, "step": 1120 }, { "epoch": 101.96969696969697, "grad_norm": 0.20796290040016174, "learning_rate": 3.8790000000000006e-05, "loss": 0.4557, "step": 1121 }, { "epoch": 102.0, "grad_norm": 0.3797859251499176, "learning_rate": 3.878e-05, "loss": 0.3847, "step": 1122 }, { "epoch": 102.0969696969697, "grad_norm": 0.25829294323921204, "learning_rate": 3.877e-05, "loss": 0.4655, "step": 1123 }, { "epoch": 102.19393939393939, "grad_norm": 0.2667910158634186, "learning_rate": 3.876e-05, "loss": 0.4809, "step": 1124 }, { "epoch": 102.2909090909091, "grad_norm": 0.30939432978630066, "learning_rate": 3.875e-05, "loss": 0.4723, "step": 1125 }, { "epoch": 102.38787878787879, "grad_norm": 0.2525250315666199, "learning_rate": 3.8740000000000005e-05, "loss": 0.4678, "step": 1126 }, { "epoch": 102.48484848484848, "grad_norm": 0.2383662611246109, "learning_rate": 3.873e-05, "loss": 0.4835, "step": 1127 }, { "epoch": 102.58181818181818, "grad_norm": 0.23342494666576385, "learning_rate": 3.872e-05, "loss": 0.4867, "step": 1128 }, { "epoch": 102.67878787878787, "grad_norm": 0.2512975037097931, "learning_rate": 3.871e-05, "loss": 0.4751, "step": 1129 }, { "epoch": 102.77575757575758, "grad_norm": 0.2931779623031616, "learning_rate": 3.8700000000000006e-05, "loss": 0.5153, "step": 1130 }, { "epoch": 102.77575757575758, "eval_loss": 0.5171474814414978, "eval_runtime": 2.1713, "eval_samples_per_second": 25.331, "eval_steps_per_second": 3.224, "step": 1130 }, { "epoch": 102.87272727272727, "grad_norm": 0.3106895089149475, "learning_rate": 3.8690000000000004e-05, "loss": 0.4588, "step": 1131 }, { "epoch": 102.96969696969697, "grad_norm": 0.24597759544849396, "learning_rate": 3.868e-05, "loss": 0.4184, "step": 1132 }, { "epoch": 103.0, "grad_norm": 0.3818826973438263, "learning_rate": 3.867e-05, "loss": 0.4709, "step": 1133 }, { "epoch": 103.0969696969697, "grad_norm": 0.23651278018951416, "learning_rate": 3.866e-05, "loss": 0.4789, "step": 1134 }, { "epoch": 103.19393939393939, "grad_norm": 0.3078266680240631, "learning_rate": 3.8650000000000004e-05, "loss": 0.5128, "step": 1135 }, { "epoch": 103.2909090909091, "grad_norm": 0.329988569021225, "learning_rate": 3.864e-05, "loss": 0.412, "step": 1136 }, { "epoch": 103.38787878787879, "grad_norm": 0.28413721919059753, "learning_rate": 3.863e-05, "loss": 0.4525, "step": 1137 }, { "epoch": 103.48484848484848, "grad_norm": 0.24670186638832092, "learning_rate": 3.862e-05, "loss": 0.5062, "step": 1138 }, { "epoch": 103.58181818181818, "grad_norm": 0.24704158306121826, "learning_rate": 3.8610000000000005e-05, "loss": 0.4857, "step": 1139 }, { "epoch": 103.67878787878787, "grad_norm": 0.20425106585025787, "learning_rate": 3.86e-05, "loss": 0.4667, "step": 1140 }, { "epoch": 103.67878787878787, "eval_loss": 0.5160987973213196, "eval_runtime": 2.1933, "eval_samples_per_second": 25.076, "eval_steps_per_second": 3.191, "step": 1140 }, { "epoch": 103.77575757575758, "grad_norm": 0.2687723934650421, "learning_rate": 3.859e-05, "loss": 0.4881, "step": 1141 }, { "epoch": 103.87272727272727, "grad_norm": 0.21118372678756714, "learning_rate": 3.858e-05, "loss": 0.4545, "step": 1142 }, { "epoch": 103.96969696969697, "grad_norm": 0.24843823909759521, "learning_rate": 3.857e-05, "loss": 0.4524, "step": 1143 }, { "epoch": 104.0, "grad_norm": 0.441841185092926, "learning_rate": 3.8560000000000004e-05, "loss": 0.4624, "step": 1144 }, { "epoch": 104.0969696969697, "grad_norm": 0.2640858590602875, "learning_rate": 3.855e-05, "loss": 0.4837, "step": 1145 }, { "epoch": 104.19393939393939, "grad_norm": 0.35844892263412476, "learning_rate": 3.854000000000001e-05, "loss": 0.5218, "step": 1146 }, { "epoch": 104.2909090909091, "grad_norm": 0.26051416993141174, "learning_rate": 3.853e-05, "loss": 0.4579, "step": 1147 }, { "epoch": 104.38787878787879, "grad_norm": 0.20667533576488495, "learning_rate": 3.8520000000000004e-05, "loss": 0.4353, "step": 1148 }, { "epoch": 104.48484848484848, "grad_norm": 0.20429357886314392, "learning_rate": 3.851e-05, "loss": 0.4358, "step": 1149 }, { "epoch": 104.58181818181818, "grad_norm": 0.2379342019557953, "learning_rate": 3.85e-05, "loss": 0.4862, "step": 1150 }, { "epoch": 104.58181818181818, "eval_loss": 0.5155452489852905, "eval_runtime": 2.1929, "eval_samples_per_second": 25.081, "eval_steps_per_second": 3.192, "step": 1150 }, { "epoch": 104.67878787878787, "grad_norm": 0.3056175112724304, "learning_rate": 3.8490000000000006e-05, "loss": 0.4735, "step": 1151 }, { "epoch": 104.77575757575758, "grad_norm": 0.31699028611183167, "learning_rate": 3.848e-05, "loss": 0.4544, "step": 1152 }, { "epoch": 104.87272727272727, "grad_norm": 0.24532760679721832, "learning_rate": 3.847e-05, "loss": 0.4553, "step": 1153 }, { "epoch": 104.96969696969697, "grad_norm": 0.22634100914001465, "learning_rate": 3.846e-05, "loss": 0.4921, "step": 1154 }, { "epoch": 105.0, "grad_norm": 0.4279729723930359, "learning_rate": 3.845e-05, "loss": 0.4548, "step": 1155 }, { "epoch": 105.0969696969697, "grad_norm": 0.27833861112594604, "learning_rate": 3.8440000000000005e-05, "loss": 0.4647, "step": 1156 }, { "epoch": 105.19393939393939, "grad_norm": 0.28585150837898254, "learning_rate": 3.8429999999999996e-05, "loss": 0.4608, "step": 1157 }, { "epoch": 105.2909090909091, "grad_norm": 0.23402553796768188, "learning_rate": 3.842e-05, "loss": 0.4832, "step": 1158 }, { "epoch": 105.38787878787879, "grad_norm": 0.23086602985858917, "learning_rate": 3.841e-05, "loss": 0.3979, "step": 1159 }, { "epoch": 105.48484848484848, "grad_norm": 0.2298888862133026, "learning_rate": 3.8400000000000005e-05, "loss": 0.4567, "step": 1160 }, { "epoch": 105.48484848484848, "eval_loss": 0.5147726535797119, "eval_runtime": 2.1655, "eval_samples_per_second": 25.398, "eval_steps_per_second": 3.232, "step": 1160 }, { "epoch": 105.58181818181818, "grad_norm": 0.3209298849105835, "learning_rate": 3.8390000000000003e-05, "loss": 0.4767, "step": 1161 }, { "epoch": 105.67878787878787, "grad_norm": 0.3022819459438324, "learning_rate": 3.838e-05, "loss": 0.4814, "step": 1162 }, { "epoch": 105.77575757575758, "grad_norm": 0.25658342242240906, "learning_rate": 3.837e-05, "loss": 0.4787, "step": 1163 }, { "epoch": 105.87272727272727, "grad_norm": 0.21234485507011414, "learning_rate": 3.836e-05, "loss": 0.4734, "step": 1164 }, { "epoch": 105.96969696969697, "grad_norm": 0.23950757086277008, "learning_rate": 3.8350000000000004e-05, "loss": 0.5033, "step": 1165 }, { "epoch": 106.0, "grad_norm": 0.4254544675350189, "learning_rate": 3.834e-05, "loss": 0.4682, "step": 1166 }, { "epoch": 106.0969696969697, "grad_norm": 0.21771714091300964, "learning_rate": 3.833e-05, "loss": 0.4728, "step": 1167 }, { "epoch": 106.19393939393939, "grad_norm": 0.2927077114582062, "learning_rate": 3.832e-05, "loss": 0.4664, "step": 1168 }, { "epoch": 106.2909090909091, "grad_norm": 0.2586457133293152, "learning_rate": 3.8310000000000004e-05, "loss": 0.4815, "step": 1169 }, { "epoch": 106.38787878787879, "grad_norm": 0.24301891028881073, "learning_rate": 3.83e-05, "loss": 0.4689, "step": 1170 }, { "epoch": 106.38787878787879, "eval_loss": 0.513548731803894, "eval_runtime": 2.1846, "eval_samples_per_second": 25.177, "eval_steps_per_second": 3.204, "step": 1170 }, { "epoch": 106.48484848484848, "grad_norm": 0.25214365124702454, "learning_rate": 3.829e-05, "loss": 0.455, "step": 1171 }, { "epoch": 106.58181818181818, "grad_norm": 0.28379714488983154, "learning_rate": 3.828e-05, "loss": 0.458, "step": 1172 }, { "epoch": 106.67878787878787, "grad_norm": 0.3102569282054901, "learning_rate": 3.827e-05, "loss": 0.4689, "step": 1173 }, { "epoch": 106.77575757575758, "grad_norm": 0.22973500192165375, "learning_rate": 3.826e-05, "loss": 0.4404, "step": 1174 }, { "epoch": 106.87272727272727, "grad_norm": 0.2559613585472107, "learning_rate": 3.825e-05, "loss": 0.4613, "step": 1175 }, { "epoch": 106.96969696969697, "grad_norm": 0.21462607383728027, "learning_rate": 3.8240000000000007e-05, "loss": 0.4623, "step": 1176 }, { "epoch": 107.0, "grad_norm": 0.4708159565925598, "learning_rate": 3.823e-05, "loss": 0.5488, "step": 1177 }, { "epoch": 107.0969696969697, "grad_norm": 0.24319447576999664, "learning_rate": 3.822e-05, "loss": 0.4784, "step": 1178 }, { "epoch": 107.19393939393939, "grad_norm": 0.32229968905448914, "learning_rate": 3.821e-05, "loss": 0.4585, "step": 1179 }, { "epoch": 107.2909090909091, "grad_norm": 0.2953487038612366, "learning_rate": 3.82e-05, "loss": 0.4459, "step": 1180 }, { "epoch": 107.2909090909091, "eval_loss": 0.5121553540229797, "eval_runtime": 2.177, "eval_samples_per_second": 25.264, "eval_steps_per_second": 3.215, "step": 1180 }, { "epoch": 107.38787878787879, "grad_norm": 0.2409444898366928, "learning_rate": 3.8190000000000005e-05, "loss": 0.4875, "step": 1181 }, { "epoch": 107.48484848484848, "grad_norm": 0.2681051194667816, "learning_rate": 3.818e-05, "loss": 0.4251, "step": 1182 }, { "epoch": 107.58181818181818, "grad_norm": 0.3105945289134979, "learning_rate": 3.817e-05, "loss": 0.4656, "step": 1183 }, { "epoch": 107.67878787878787, "grad_norm": 0.27236083149909973, "learning_rate": 3.816e-05, "loss": 0.4998, "step": 1184 }, { "epoch": 107.77575757575758, "grad_norm": 0.27155423164367676, "learning_rate": 3.8150000000000006e-05, "loss": 0.482, "step": 1185 }, { "epoch": 107.87272727272727, "grad_norm": 0.23208624124526978, "learning_rate": 3.8140000000000004e-05, "loss": 0.4508, "step": 1186 }, { "epoch": 107.96969696969697, "grad_norm": 0.2420530915260315, "learning_rate": 3.8129999999999996e-05, "loss": 0.4653, "step": 1187 }, { "epoch": 108.0, "grad_norm": 0.3872806131839752, "learning_rate": 3.812e-05, "loss": 0.423, "step": 1188 }, { "epoch": 108.0969696969697, "grad_norm": 0.29809868335723877, "learning_rate": 3.811e-05, "loss": 0.4195, "step": 1189 }, { "epoch": 108.19393939393939, "grad_norm": 0.2518772482872009, "learning_rate": 3.8100000000000005e-05, "loss": 0.4722, "step": 1190 }, { "epoch": 108.19393939393939, "eval_loss": 0.51094651222229, "eval_runtime": 2.1706, "eval_samples_per_second": 25.339, "eval_steps_per_second": 3.225, "step": 1190 }, { "epoch": 108.2909090909091, "grad_norm": 0.33083200454711914, "learning_rate": 3.809e-05, "loss": 0.434, "step": 1191 }, { "epoch": 108.38787878787879, "grad_norm": 0.2682957351207733, "learning_rate": 3.808e-05, "loss": 0.4792, "step": 1192 }, { "epoch": 108.48484848484848, "grad_norm": 0.2922620177268982, "learning_rate": 3.807e-05, "loss": 0.4703, "step": 1193 }, { "epoch": 108.58181818181818, "grad_norm": 0.33325645327568054, "learning_rate": 3.806e-05, "loss": 0.529, "step": 1194 }, { "epoch": 108.67878787878787, "grad_norm": 0.2711305022239685, "learning_rate": 3.805e-05, "loss": 0.4517, "step": 1195 }, { "epoch": 108.77575757575758, "grad_norm": 0.2560056149959564, "learning_rate": 3.804e-05, "loss": 0.4792, "step": 1196 }, { "epoch": 108.87272727272727, "grad_norm": 0.25109291076660156, "learning_rate": 3.803000000000001e-05, "loss": 0.4449, "step": 1197 }, { "epoch": 108.96969696969697, "grad_norm": 0.23504619300365448, "learning_rate": 3.802e-05, "loss": 0.466, "step": 1198 }, { "epoch": 109.0, "grad_norm": 0.45923951268196106, "learning_rate": 3.8010000000000004e-05, "loss": 0.406, "step": 1199 }, { "epoch": 109.0969696969697, "grad_norm": 0.2102024257183075, "learning_rate": 3.8e-05, "loss": 0.4256, "step": 1200 }, { "epoch": 109.0969696969697, "eval_loss": 0.5096712112426758, "eval_runtime": 2.183, "eval_samples_per_second": 25.195, "eval_steps_per_second": 3.207, "step": 1200 }, { "epoch": 109.19393939393939, "grad_norm": 0.24149499833583832, "learning_rate": 3.799e-05, "loss": 0.4755, "step": 1201 }, { "epoch": 109.2909090909091, "grad_norm": 0.20073340833187103, "learning_rate": 3.7980000000000006e-05, "loss": 0.4245, "step": 1202 }, { "epoch": 109.38787878787879, "grad_norm": 0.24485673010349274, "learning_rate": 3.797e-05, "loss": 0.4567, "step": 1203 }, { "epoch": 109.48484848484848, "grad_norm": 0.26820191740989685, "learning_rate": 3.796e-05, "loss": 0.4445, "step": 1204 }, { "epoch": 109.58181818181818, "grad_norm": 0.24449220299720764, "learning_rate": 3.795e-05, "loss": 0.5022, "step": 1205 }, { "epoch": 109.67878787878787, "grad_norm": 0.259132444858551, "learning_rate": 3.7940000000000006e-05, "loss": 0.4861, "step": 1206 }, { "epoch": 109.77575757575758, "grad_norm": 0.28169506788253784, "learning_rate": 3.7930000000000004e-05, "loss": 0.4712, "step": 1207 }, { "epoch": 109.87272727272727, "grad_norm": 0.2922845780849457, "learning_rate": 3.792e-05, "loss": 0.4567, "step": 1208 }, { "epoch": 109.96969696969697, "grad_norm": 0.25783708691596985, "learning_rate": 3.791e-05, "loss": 0.4752, "step": 1209 }, { "epoch": 110.0, "grad_norm": 0.38848626613616943, "learning_rate": 3.79e-05, "loss": 0.4511, "step": 1210 }, { "epoch": 110.0, "eval_loss": 0.5089406967163086, "eval_runtime": 2.178, "eval_samples_per_second": 25.253, "eval_steps_per_second": 3.214, "step": 1210 }, { "epoch": 110.0969696969697, "grad_norm": 0.24029256403446198, "learning_rate": 3.7890000000000005e-05, "loss": 0.4672, "step": 1211 }, { "epoch": 110.19393939393939, "grad_norm": 0.25455084443092346, "learning_rate": 3.788e-05, "loss": 0.4671, "step": 1212 }, { "epoch": 110.2909090909091, "grad_norm": 0.29211434721946716, "learning_rate": 3.787e-05, "loss": 0.4523, "step": 1213 }, { "epoch": 110.38787878787879, "grad_norm": 0.31663522124290466, "learning_rate": 3.786e-05, "loss": 0.4581, "step": 1214 }, { "epoch": 110.48484848484848, "grad_norm": 0.23833386600017548, "learning_rate": 3.7850000000000005e-05, "loss": 0.4409, "step": 1215 }, { "epoch": 110.58181818181818, "grad_norm": 0.3347303569316864, "learning_rate": 3.7840000000000004e-05, "loss": 0.512, "step": 1216 }, { "epoch": 110.67878787878787, "grad_norm": 0.26010632514953613, "learning_rate": 3.783e-05, "loss": 0.4411, "step": 1217 }, { "epoch": 110.77575757575758, "grad_norm": 0.27888065576553345, "learning_rate": 3.782e-05, "loss": 0.4628, "step": 1218 }, { "epoch": 110.87272727272727, "grad_norm": 0.2746562957763672, "learning_rate": 3.781e-05, "loss": 0.4374, "step": 1219 }, { "epoch": 110.96969696969697, "grad_norm": 0.27418023347854614, "learning_rate": 3.7800000000000004e-05, "loss": 0.4701, "step": 1220 }, { "epoch": 110.96969696969697, "eval_loss": 0.5076990723609924, "eval_runtime": 2.1853, "eval_samples_per_second": 25.168, "eval_steps_per_second": 3.203, "step": 1220 }, { "epoch": 111.0, "grad_norm": 0.4324119985103607, "learning_rate": 3.779e-05, "loss": 0.4461, "step": 1221 }, { "epoch": 111.0969696969697, "grad_norm": 0.26746389269828796, "learning_rate": 3.778000000000001e-05, "loss": 0.4694, "step": 1222 }, { "epoch": 111.19393939393939, "grad_norm": 0.2583480477333069, "learning_rate": 3.777e-05, "loss": 0.459, "step": 1223 }, { "epoch": 111.2909090909091, "grad_norm": 0.25809216499328613, "learning_rate": 3.776e-05, "loss": 0.4464, "step": 1224 }, { "epoch": 111.38787878787879, "grad_norm": 0.2889714539051056, "learning_rate": 3.775e-05, "loss": 0.4636, "step": 1225 }, { "epoch": 111.48484848484848, "grad_norm": 0.30283504724502563, "learning_rate": 3.774e-05, "loss": 0.4838, "step": 1226 }, { "epoch": 111.58181818181818, "grad_norm": 0.24876466393470764, "learning_rate": 3.7730000000000006e-05, "loss": 0.4623, "step": 1227 }, { "epoch": 111.67878787878787, "grad_norm": 0.23016254603862762, "learning_rate": 3.772e-05, "loss": 0.4898, "step": 1228 }, { "epoch": 111.77575757575758, "grad_norm": 0.2642112970352173, "learning_rate": 3.771e-05, "loss": 0.4576, "step": 1229 }, { "epoch": 111.87272727272727, "grad_norm": 0.24856284260749817, "learning_rate": 3.77e-05, "loss": 0.4255, "step": 1230 }, { "epoch": 111.87272727272727, "eval_loss": 0.5072154998779297, "eval_runtime": 2.1804, "eval_samples_per_second": 25.225, "eval_steps_per_second": 3.21, "step": 1230 }, { "epoch": 111.96969696969697, "grad_norm": 0.2232726514339447, "learning_rate": 3.769e-05, "loss": 0.4518, "step": 1231 }, { "epoch": 112.0, "grad_norm": 0.38369572162628174, "learning_rate": 3.7680000000000005e-05, "loss": 0.3925, "step": 1232 }, { "epoch": 112.0969696969697, "grad_norm": 0.2862841784954071, "learning_rate": 3.767e-05, "loss": 0.4098, "step": 1233 }, { "epoch": 112.19393939393939, "grad_norm": 0.28138405084609985, "learning_rate": 3.766e-05, "loss": 0.5203, "step": 1234 }, { "epoch": 112.2909090909091, "grad_norm": 0.26404833793640137, "learning_rate": 3.765e-05, "loss": 0.4698, "step": 1235 }, { "epoch": 112.38787878787879, "grad_norm": 0.21245059370994568, "learning_rate": 3.7640000000000006e-05, "loss": 0.4848, "step": 1236 }, { "epoch": 112.48484848484848, "grad_norm": 0.2753354609012604, "learning_rate": 3.7630000000000004e-05, "loss": 0.4127, "step": 1237 }, { "epoch": 112.58181818181818, "grad_norm": 0.3698137700557709, "learning_rate": 3.762e-05, "loss": 0.3973, "step": 1238 }, { "epoch": 112.67878787878787, "grad_norm": 0.23089006543159485, "learning_rate": 3.761e-05, "loss": 0.4356, "step": 1239 }, { "epoch": 112.77575757575758, "grad_norm": 0.24908600747585297, "learning_rate": 3.76e-05, "loss": 0.4809, "step": 1240 }, { "epoch": 112.77575757575758, "eval_loss": 0.5066096782684326, "eval_runtime": 2.1764, "eval_samples_per_second": 25.271, "eval_steps_per_second": 3.216, "step": 1240 }, { "epoch": 112.87272727272727, "grad_norm": 0.2229243665933609, "learning_rate": 3.7590000000000004e-05, "loss": 0.4359, "step": 1241 }, { "epoch": 112.96969696969697, "grad_norm": 0.3743974268436432, "learning_rate": 3.758e-05, "loss": 0.4945, "step": 1242 }, { "epoch": 113.0, "grad_norm": 0.6298211812973022, "learning_rate": 3.757e-05, "loss": 0.5496, "step": 1243 }, { "epoch": 113.0969696969697, "grad_norm": 0.3047841787338257, "learning_rate": 3.756e-05, "loss": 0.4694, "step": 1244 }, { "epoch": 113.19393939393939, "grad_norm": 0.22005103528499603, "learning_rate": 3.7550000000000005e-05, "loss": 0.4298, "step": 1245 }, { "epoch": 113.2909090909091, "grad_norm": 0.265865296125412, "learning_rate": 3.754e-05, "loss": 0.4511, "step": 1246 }, { "epoch": 113.38787878787879, "grad_norm": 0.2695404887199402, "learning_rate": 3.753e-05, "loss": 0.4889, "step": 1247 }, { "epoch": 113.48484848484848, "grad_norm": 0.26106733083724976, "learning_rate": 3.752e-05, "loss": 0.4412, "step": 1248 }, { "epoch": 113.58181818181818, "grad_norm": 0.27255240082740784, "learning_rate": 3.751e-05, "loss": 0.4569, "step": 1249 }, { "epoch": 113.67878787878787, "grad_norm": 0.23935070633888245, "learning_rate": 3.7500000000000003e-05, "loss": 0.45, "step": 1250 }, { "epoch": 113.67878787878787, "eval_loss": 0.5050410032272339, "eval_runtime": 2.1859, "eval_samples_per_second": 25.161, "eval_steps_per_second": 3.202, "step": 1250 }, { "epoch": 113.77575757575758, "grad_norm": 0.25609126687049866, "learning_rate": 3.749e-05, "loss": 0.4751, "step": 1251 }, { "epoch": 113.87272727272727, "grad_norm": 0.28254762291908264, "learning_rate": 3.748000000000001e-05, "loss": 0.4178, "step": 1252 }, { "epoch": 113.96969696969697, "grad_norm": 0.2685804069042206, "learning_rate": 3.747e-05, "loss": 0.4401, "step": 1253 }, { "epoch": 114.0, "grad_norm": 0.6137757301330566, "learning_rate": 3.7460000000000004e-05, "loss": 0.5856, "step": 1254 }, { "epoch": 114.0969696969697, "grad_norm": 0.25804847478866577, "learning_rate": 3.745e-05, "loss": 0.4107, "step": 1255 }, { "epoch": 114.19393939393939, "grad_norm": 0.24832597374916077, "learning_rate": 3.744e-05, "loss": 0.5157, "step": 1256 }, { "epoch": 114.2909090909091, "grad_norm": 0.30176836252212524, "learning_rate": 3.7430000000000006e-05, "loss": 0.4617, "step": 1257 }, { "epoch": 114.38787878787879, "grad_norm": 0.2873697280883789, "learning_rate": 3.742e-05, "loss": 0.4844, "step": 1258 }, { "epoch": 114.48484848484848, "grad_norm": 0.3384517431259155, "learning_rate": 3.741e-05, "loss": 0.4438, "step": 1259 }, { "epoch": 114.58181818181818, "grad_norm": 0.26792243123054504, "learning_rate": 3.74e-05, "loss": 0.4231, "step": 1260 }, { "epoch": 114.58181818181818, "eval_loss": 0.5039417147636414, "eval_runtime": 2.1877, "eval_samples_per_second": 25.14, "eval_steps_per_second": 3.2, "step": 1260 }, { "epoch": 114.67878787878787, "grad_norm": 0.22581951320171356, "learning_rate": 3.739e-05, "loss": 0.4391, "step": 1261 }, { "epoch": 114.77575757575758, "grad_norm": 0.27811703085899353, "learning_rate": 3.7380000000000005e-05, "loss": 0.4762, "step": 1262 }, { "epoch": 114.87272727272727, "grad_norm": 0.28211668133735657, "learning_rate": 3.7369999999999996e-05, "loss": 0.4486, "step": 1263 }, { "epoch": 114.96969696969697, "grad_norm": 0.35375770926475525, "learning_rate": 3.736e-05, "loss": 0.4255, "step": 1264 }, { "epoch": 115.0, "grad_norm": 0.40355023741722107, "learning_rate": 3.735e-05, "loss": 0.5068, "step": 1265 }, { "epoch": 115.0969696969697, "grad_norm": 0.22719883918762207, "learning_rate": 3.7340000000000005e-05, "loss": 0.4527, "step": 1266 }, { "epoch": 115.19393939393939, "grad_norm": 0.2507949471473694, "learning_rate": 3.7330000000000003e-05, "loss": 0.4177, "step": 1267 }, { "epoch": 115.2909090909091, "grad_norm": 0.3059385120868683, "learning_rate": 3.732e-05, "loss": 0.4506, "step": 1268 }, { "epoch": 115.38787878787879, "grad_norm": 0.24009016156196594, "learning_rate": 3.731e-05, "loss": 0.4618, "step": 1269 }, { "epoch": 115.48484848484848, "grad_norm": 0.2662716507911682, "learning_rate": 3.73e-05, "loss": 0.4522, "step": 1270 }, { "epoch": 115.48484848484848, "eval_loss": 0.5031892657279968, "eval_runtime": 2.1723, "eval_samples_per_second": 25.318, "eval_steps_per_second": 3.222, "step": 1270 }, { "epoch": 115.58181818181818, "grad_norm": 0.2552890181541443, "learning_rate": 3.7290000000000004e-05, "loss": 0.4387, "step": 1271 }, { "epoch": 115.67878787878787, "grad_norm": 0.304675430059433, "learning_rate": 3.728e-05, "loss": 0.5211, "step": 1272 }, { "epoch": 115.77575757575758, "grad_norm": 0.3080827593803406, "learning_rate": 3.727e-05, "loss": 0.4216, "step": 1273 }, { "epoch": 115.87272727272727, "grad_norm": 0.2266256958246231, "learning_rate": 3.726e-05, "loss": 0.4419, "step": 1274 }, { "epoch": 115.96969696969697, "grad_norm": 0.3163573145866394, "learning_rate": 3.7250000000000004e-05, "loss": 0.475, "step": 1275 }, { "epoch": 116.0, "grad_norm": 0.36579272150993347, "learning_rate": 3.724e-05, "loss": 0.4423, "step": 1276 }, { "epoch": 116.0969696969697, "grad_norm": 0.25317344069480896, "learning_rate": 3.723e-05, "loss": 0.4364, "step": 1277 }, { "epoch": 116.19393939393939, "grad_norm": 0.23825958371162415, "learning_rate": 3.722e-05, "loss": 0.4653, "step": 1278 }, { "epoch": 116.2909090909091, "grad_norm": 0.27158665657043457, "learning_rate": 3.721e-05, "loss": 0.4417, "step": 1279 }, { "epoch": 116.38787878787879, "grad_norm": 0.32222259044647217, "learning_rate": 3.72e-05, "loss": 0.4636, "step": 1280 }, { "epoch": 116.38787878787879, "eval_loss": 0.5025956034660339, "eval_runtime": 2.1635, "eval_samples_per_second": 25.422, "eval_steps_per_second": 3.236, "step": 1280 }, { "epoch": 116.48484848484848, "grad_norm": 0.2667633593082428, "learning_rate": 3.719e-05, "loss": 0.4709, "step": 1281 }, { "epoch": 116.58181818181818, "grad_norm": 0.2588888108730316, "learning_rate": 3.7180000000000007e-05, "loss": 0.4164, "step": 1282 }, { "epoch": 116.67878787878787, "grad_norm": 0.2974129617214203, "learning_rate": 3.717e-05, "loss": 0.432, "step": 1283 }, { "epoch": 116.77575757575758, "grad_norm": 0.24473229050636292, "learning_rate": 3.716e-05, "loss": 0.4942, "step": 1284 }, { "epoch": 116.87272727272727, "grad_norm": 0.2632085084915161, "learning_rate": 3.715e-05, "loss": 0.4465, "step": 1285 }, { "epoch": 116.96969696969697, "grad_norm": 0.29160937666893005, "learning_rate": 3.714e-05, "loss": 0.442, "step": 1286 }, { "epoch": 117.0, "grad_norm": 0.40403980016708374, "learning_rate": 3.7130000000000005e-05, "loss": 0.4771, "step": 1287 }, { "epoch": 117.0969696969697, "grad_norm": 0.3258376717567444, "learning_rate": 3.712e-05, "loss": 0.4886, "step": 1288 }, { "epoch": 117.19393939393939, "grad_norm": 0.24444925785064697, "learning_rate": 3.711e-05, "loss": 0.4539, "step": 1289 }, { "epoch": 117.2909090909091, "grad_norm": 0.24981863796710968, "learning_rate": 3.71e-05, "loss": 0.4257, "step": 1290 }, { "epoch": 117.2909090909091, "eval_loss": 0.5017372369766235, "eval_runtime": 2.1651, "eval_samples_per_second": 25.403, "eval_steps_per_second": 3.233, "step": 1290 }, { "epoch": 117.38787878787879, "grad_norm": 0.22072580456733704, "learning_rate": 3.7090000000000006e-05, "loss": 0.4551, "step": 1291 }, { "epoch": 117.48484848484848, "grad_norm": 0.24209479987621307, "learning_rate": 3.7080000000000004e-05, "loss": 0.4591, "step": 1292 }, { "epoch": 117.58181818181818, "grad_norm": 0.2738741934299469, "learning_rate": 3.707e-05, "loss": 0.4762, "step": 1293 }, { "epoch": 117.67878787878787, "grad_norm": 0.22845780849456787, "learning_rate": 3.706e-05, "loss": 0.4312, "step": 1294 }, { "epoch": 117.77575757575758, "grad_norm": 0.2801177203655243, "learning_rate": 3.705e-05, "loss": 0.452, "step": 1295 }, { "epoch": 117.87272727272727, "grad_norm": 0.25144457817077637, "learning_rate": 3.7040000000000005e-05, "loss": 0.4597, "step": 1296 }, { "epoch": 117.96969696969697, "grad_norm": 0.22239412367343903, "learning_rate": 3.703e-05, "loss": 0.4181, "step": 1297 }, { "epoch": 118.0, "grad_norm": 0.4170636832714081, "learning_rate": 3.702e-05, "loss": 0.4162, "step": 1298 }, { "epoch": 118.0969696969697, "grad_norm": 0.2747899889945984, "learning_rate": 3.701e-05, "loss": 0.4957, "step": 1299 }, { "epoch": 118.19393939393939, "grad_norm": 0.31099769473075867, "learning_rate": 3.7e-05, "loss": 0.4575, "step": 1300 }, { "epoch": 118.19393939393939, "eval_loss": 0.500615119934082, "eval_runtime": 2.1657, "eval_samples_per_second": 25.396, "eval_steps_per_second": 3.232, "step": 1300 }, { "epoch": 118.2909090909091, "grad_norm": 0.3002060055732727, "learning_rate": 3.699e-05, "loss": 0.4245, "step": 1301 }, { "epoch": 118.38787878787879, "grad_norm": 0.21696718037128448, "learning_rate": 3.698e-05, "loss": 0.5023, "step": 1302 }, { "epoch": 118.48484848484848, "grad_norm": 0.2654174566268921, "learning_rate": 3.697e-05, "loss": 0.4316, "step": 1303 }, { "epoch": 118.58181818181818, "grad_norm": 0.28204402327537537, "learning_rate": 3.696e-05, "loss": 0.4399, "step": 1304 }, { "epoch": 118.67878787878787, "grad_norm": 0.25560328364372253, "learning_rate": 3.6950000000000004e-05, "loss": 0.3992, "step": 1305 }, { "epoch": 118.77575757575758, "grad_norm": 0.2698306441307068, "learning_rate": 3.694e-05, "loss": 0.4552, "step": 1306 }, { "epoch": 118.87272727272727, "grad_norm": 0.26093652844429016, "learning_rate": 3.693e-05, "loss": 0.4587, "step": 1307 }, { "epoch": 118.96969696969697, "grad_norm": 0.30971360206604004, "learning_rate": 3.692e-05, "loss": 0.4194, "step": 1308 }, { "epoch": 119.0, "grad_norm": 0.4734983444213867, "learning_rate": 3.691e-05, "loss": 0.4719, "step": 1309 }, { "epoch": 119.0969696969697, "grad_norm": 0.32670915126800537, "learning_rate": 3.69e-05, "loss": 0.4586, "step": 1310 }, { "epoch": 119.0969696969697, "eval_loss": 0.5000902414321899, "eval_runtime": 2.2185, "eval_samples_per_second": 24.792, "eval_steps_per_second": 3.155, "step": 1310 }, { "epoch": 119.19393939393939, "grad_norm": 0.22537638247013092, "learning_rate": 3.689e-05, "loss": 0.477, "step": 1311 }, { "epoch": 119.2909090909091, "grad_norm": 0.258887380361557, "learning_rate": 3.6880000000000006e-05, "loss": 0.4402, "step": 1312 }, { "epoch": 119.38787878787879, "grad_norm": 0.26598671078681946, "learning_rate": 3.6870000000000004e-05, "loss": 0.4254, "step": 1313 }, { "epoch": 119.48484848484848, "grad_norm": 0.23355336487293243, "learning_rate": 3.686e-05, "loss": 0.4387, "step": 1314 }, { "epoch": 119.58181818181818, "grad_norm": 0.2759830355644226, "learning_rate": 3.685e-05, "loss": 0.4834, "step": 1315 }, { "epoch": 119.67878787878787, "grad_norm": 0.24471092224121094, "learning_rate": 3.684e-05, "loss": 0.4632, "step": 1316 }, { "epoch": 119.77575757575758, "grad_norm": 0.23647384345531464, "learning_rate": 3.6830000000000005e-05, "loss": 0.3989, "step": 1317 }, { "epoch": 119.87272727272727, "grad_norm": 0.2802410125732422, "learning_rate": 3.682e-05, "loss": 0.4359, "step": 1318 }, { "epoch": 119.96969696969697, "grad_norm": 0.21583354473114014, "learning_rate": 3.681e-05, "loss": 0.4621, "step": 1319 }, { "epoch": 120.0, "grad_norm": 0.40480583906173706, "learning_rate": 3.68e-05, "loss": 0.4294, "step": 1320 }, { "epoch": 120.0, "eval_loss": 0.4989987015724182, "eval_runtime": 2.138, "eval_samples_per_second": 25.725, "eval_steps_per_second": 3.274, "step": 1320 }, { "epoch": 120.0969696969697, "grad_norm": 0.3164379894733429, "learning_rate": 3.6790000000000005e-05, "loss": 0.4802, "step": 1321 }, { "epoch": 120.19393939393939, "grad_norm": 0.31919732689857483, "learning_rate": 3.6780000000000004e-05, "loss": 0.4447, "step": 1322 }, { "epoch": 120.2909090909091, "grad_norm": 0.29702770709991455, "learning_rate": 3.677e-05, "loss": 0.4632, "step": 1323 }, { "epoch": 120.38787878787879, "grad_norm": 0.20388232171535492, "learning_rate": 3.676e-05, "loss": 0.4442, "step": 1324 }, { "epoch": 120.48484848484848, "grad_norm": 0.22363580763339996, "learning_rate": 3.675e-05, "loss": 0.4455, "step": 1325 }, { "epoch": 120.58181818181818, "grad_norm": 0.24787390232086182, "learning_rate": 3.6740000000000004e-05, "loss": 0.4437, "step": 1326 }, { "epoch": 120.67878787878787, "grad_norm": 0.23430295288562775, "learning_rate": 3.673e-05, "loss": 0.3896, "step": 1327 }, { "epoch": 120.77575757575758, "grad_norm": 0.24690541625022888, "learning_rate": 3.672000000000001e-05, "loss": 0.4307, "step": 1328 }, { "epoch": 120.87272727272727, "grad_norm": 0.3323303759098053, "learning_rate": 3.671e-05, "loss": 0.4549, "step": 1329 }, { "epoch": 120.96969696969697, "grad_norm": 0.29622218012809753, "learning_rate": 3.6700000000000004e-05, "loss": 0.4643, "step": 1330 }, { "epoch": 120.96969696969697, "eval_loss": 0.4983088970184326, "eval_runtime": 2.1516, "eval_samples_per_second": 25.563, "eval_steps_per_second": 3.253, "step": 1330 }, { "epoch": 121.0, "grad_norm": 0.5410318970680237, "learning_rate": 3.669e-05, "loss": 0.468, "step": 1331 }, { "epoch": 121.0969696969697, "grad_norm": 0.20910969376564026, "learning_rate": 3.668e-05, "loss": 0.4611, "step": 1332 }, { "epoch": 121.19393939393939, "grad_norm": 0.39248064160346985, "learning_rate": 3.6670000000000006e-05, "loss": 0.3951, "step": 1333 }, { "epoch": 121.2909090909091, "grad_norm": 0.33087027072906494, "learning_rate": 3.666e-05, "loss": 0.4323, "step": 1334 }, { "epoch": 121.38787878787879, "grad_norm": 0.4120776951313019, "learning_rate": 3.665e-05, "loss": 0.4498, "step": 1335 }, { "epoch": 121.48484848484848, "grad_norm": 0.2515096962451935, "learning_rate": 3.664e-05, "loss": 0.4624, "step": 1336 }, { "epoch": 121.58181818181818, "grad_norm": 0.28713738918304443, "learning_rate": 3.663e-05, "loss": 0.4362, "step": 1337 }, { "epoch": 121.67878787878787, "grad_norm": 0.3005863130092621, "learning_rate": 3.6620000000000005e-05, "loss": 0.4994, "step": 1338 }, { "epoch": 121.77575757575758, "grad_norm": 0.3588126301765442, "learning_rate": 3.661e-05, "loss": 0.4499, "step": 1339 }, { "epoch": 121.87272727272727, "grad_norm": 0.34536463022232056, "learning_rate": 3.66e-05, "loss": 0.4284, "step": 1340 }, { "epoch": 121.87272727272727, "eval_loss": 0.49804064631462097, "eval_runtime": 2.1636, "eval_samples_per_second": 25.421, "eval_steps_per_second": 3.235, "step": 1340 }, { "epoch": 121.96969696969697, "grad_norm": 0.28389716148376465, "learning_rate": 3.659e-05, "loss": 0.4487, "step": 1341 }, { "epoch": 122.0, "grad_norm": 0.4554291367530823, "learning_rate": 3.6580000000000006e-05, "loss": 0.4294, "step": 1342 }, { "epoch": 122.0969696969697, "grad_norm": 0.24930481612682343, "learning_rate": 3.6570000000000004e-05, "loss": 0.4316, "step": 1343 }, { "epoch": 122.19393939393939, "grad_norm": 0.25624746084213257, "learning_rate": 3.656e-05, "loss": 0.4593, "step": 1344 }, { "epoch": 122.2909090909091, "grad_norm": 0.28764474391937256, "learning_rate": 3.655e-05, "loss": 0.4388, "step": 1345 }, { "epoch": 122.38787878787879, "grad_norm": 0.23803141713142395, "learning_rate": 3.654e-05, "loss": 0.4672, "step": 1346 }, { "epoch": 122.48484848484848, "grad_norm": 0.4183109700679779, "learning_rate": 3.6530000000000004e-05, "loss": 0.453, "step": 1347 }, { "epoch": 122.58181818181818, "grad_norm": 0.3076031506061554, "learning_rate": 3.652e-05, "loss": 0.4322, "step": 1348 }, { "epoch": 122.67878787878787, "grad_norm": 0.25950199365615845, "learning_rate": 3.651e-05, "loss": 0.4452, "step": 1349 }, { "epoch": 122.77575757575758, "grad_norm": 0.27765706181526184, "learning_rate": 3.65e-05, "loss": 0.4321, "step": 1350 }, { "epoch": 122.77575757575758, "eval_loss": 0.49684277176856995, "eval_runtime": 2.17, "eval_samples_per_second": 25.345, "eval_steps_per_second": 3.226, "step": 1350 }, { "epoch": 122.87272727272727, "grad_norm": 0.3624914288520813, "learning_rate": 3.6490000000000005e-05, "loss": 0.4507, "step": 1351 }, { "epoch": 122.96969696969697, "grad_norm": 0.29541799426078796, "learning_rate": 3.648e-05, "loss": 0.4287, "step": 1352 }, { "epoch": 123.0, "grad_norm": 0.4932088255882263, "learning_rate": 3.647e-05, "loss": 0.4679, "step": 1353 }, { "epoch": 123.0969696969697, "grad_norm": 0.4175611138343811, "learning_rate": 3.646e-05, "loss": 0.4432, "step": 1354 }, { "epoch": 123.19393939393939, "grad_norm": 0.3209320902824402, "learning_rate": 3.645e-05, "loss": 0.4445, "step": 1355 }, { "epoch": 123.2909090909091, "grad_norm": 0.378221720457077, "learning_rate": 3.6440000000000003e-05, "loss": 0.4699, "step": 1356 }, { "epoch": 123.38787878787879, "grad_norm": 0.3122067451477051, "learning_rate": 3.643e-05, "loss": 0.4392, "step": 1357 }, { "epoch": 123.48484848484848, "grad_norm": 0.25616055727005005, "learning_rate": 3.642000000000001e-05, "loss": 0.4524, "step": 1358 }, { "epoch": 123.58181818181818, "grad_norm": 0.27498358488082886, "learning_rate": 3.641e-05, "loss": 0.4341, "step": 1359 }, { "epoch": 123.67878787878787, "grad_norm": 0.25121861696243286, "learning_rate": 3.6400000000000004e-05, "loss": 0.4653, "step": 1360 }, { "epoch": 123.67878787878787, "eval_loss": 0.4961063265800476, "eval_runtime": 2.1514, "eval_samples_per_second": 25.565, "eval_steps_per_second": 3.254, "step": 1360 }, { "epoch": 123.77575757575758, "grad_norm": 0.3191700875759125, "learning_rate": 3.639e-05, "loss": 0.4405, "step": 1361 }, { "epoch": 123.87272727272727, "grad_norm": 0.30171266198158264, "learning_rate": 3.638e-05, "loss": 0.4325, "step": 1362 }, { "epoch": 123.96969696969697, "grad_norm": 0.3361068069934845, "learning_rate": 3.6370000000000006e-05, "loss": 0.4203, "step": 1363 }, { "epoch": 124.0, "grad_norm": 0.4331413209438324, "learning_rate": 3.636e-05, "loss": 0.4128, "step": 1364 }, { "epoch": 124.0969696969697, "grad_norm": 0.2644399106502533, "learning_rate": 3.635e-05, "loss": 0.4806, "step": 1365 }, { "epoch": 124.19393939393939, "grad_norm": 0.24663595855236053, "learning_rate": 3.634e-05, "loss": 0.4547, "step": 1366 }, { "epoch": 124.2909090909091, "grad_norm": 0.2665797770023346, "learning_rate": 3.6330000000000006e-05, "loss": 0.4051, "step": 1367 }, { "epoch": 124.38787878787879, "grad_norm": 0.3389240801334381, "learning_rate": 3.6320000000000005e-05, "loss": 0.4469, "step": 1368 }, { "epoch": 124.48484848484848, "grad_norm": 0.2690924108028412, "learning_rate": 3.6309999999999996e-05, "loss": 0.4341, "step": 1369 }, { "epoch": 124.58181818181818, "grad_norm": 0.24461796879768372, "learning_rate": 3.63e-05, "loss": 0.4346, "step": 1370 }, { "epoch": 124.58181818181818, "eval_loss": 0.4951164722442627, "eval_runtime": 2.1649, "eval_samples_per_second": 25.406, "eval_steps_per_second": 3.233, "step": 1370 }, { "epoch": 124.67878787878787, "grad_norm": 0.3070378601551056, "learning_rate": 3.629e-05, "loss": 0.4971, "step": 1371 }, { "epoch": 124.77575757575758, "grad_norm": 0.2280680537223816, "learning_rate": 3.6280000000000005e-05, "loss": 0.4394, "step": 1372 }, { "epoch": 124.87272727272727, "grad_norm": 0.2670609652996063, "learning_rate": 3.6270000000000003e-05, "loss": 0.4052, "step": 1373 }, { "epoch": 124.96969696969697, "grad_norm": 0.2442789077758789, "learning_rate": 3.626e-05, "loss": 0.4269, "step": 1374 }, { "epoch": 125.0, "grad_norm": 0.38368964195251465, "learning_rate": 3.625e-05, "loss": 0.4233, "step": 1375 }, { "epoch": 125.0969696969697, "grad_norm": 0.23206835985183716, "learning_rate": 3.624e-05, "loss": 0.4322, "step": 1376 }, { "epoch": 125.19393939393939, "grad_norm": 0.27216339111328125, "learning_rate": 3.6230000000000004e-05, "loss": 0.4493, "step": 1377 }, { "epoch": 125.2909090909091, "grad_norm": 0.26469945907592773, "learning_rate": 3.622e-05, "loss": 0.4055, "step": 1378 }, { "epoch": 125.38787878787879, "grad_norm": 0.2685600221157074, "learning_rate": 3.621e-05, "loss": 0.4496, "step": 1379 }, { "epoch": 125.48484848484848, "grad_norm": 0.2712048888206482, "learning_rate": 3.62e-05, "loss": 0.4677, "step": 1380 }, { "epoch": 125.48484848484848, "eval_loss": 0.4940645396709442, "eval_runtime": 2.1505, "eval_samples_per_second": 25.575, "eval_steps_per_second": 3.255, "step": 1380 }, { "epoch": 125.58181818181818, "grad_norm": 0.3111526668071747, "learning_rate": 3.6190000000000004e-05, "loss": 0.4701, "step": 1381 }, { "epoch": 125.67878787878787, "grad_norm": 0.2506205141544342, "learning_rate": 3.618e-05, "loss": 0.4498, "step": 1382 }, { "epoch": 125.77575757575758, "grad_norm": 0.347450315952301, "learning_rate": 3.617e-05, "loss": 0.4418, "step": 1383 }, { "epoch": 125.87272727272727, "grad_norm": 0.3053918778896332, "learning_rate": 3.616e-05, "loss": 0.4602, "step": 1384 }, { "epoch": 125.96969696969697, "grad_norm": 0.25132766366004944, "learning_rate": 3.615e-05, "loss": 0.4036, "step": 1385 }, { "epoch": 126.0, "grad_norm": 0.41760462522506714, "learning_rate": 3.614e-05, "loss": 0.3651, "step": 1386 }, { "epoch": 126.0969696969697, "grad_norm": 0.3355599045753479, "learning_rate": 3.613e-05, "loss": 0.4767, "step": 1387 }, { "epoch": 126.19393939393939, "grad_norm": 0.29463788866996765, "learning_rate": 3.6120000000000007e-05, "loss": 0.4493, "step": 1388 }, { "epoch": 126.2909090909091, "grad_norm": 0.3120037317276001, "learning_rate": 3.611e-05, "loss": 0.4589, "step": 1389 }, { "epoch": 126.38787878787879, "grad_norm": 0.22270894050598145, "learning_rate": 3.61e-05, "loss": 0.4225, "step": 1390 }, { "epoch": 126.38787878787879, "eval_loss": 0.4936691224575043, "eval_runtime": 2.1584, "eval_samples_per_second": 25.481, "eval_steps_per_second": 3.243, "step": 1390 }, { "epoch": 126.48484848484848, "grad_norm": 0.2702427804470062, "learning_rate": 3.609e-05, "loss": 0.4303, "step": 1391 }, { "epoch": 126.58181818181818, "grad_norm": 0.2349555343389511, "learning_rate": 3.608e-05, "loss": 0.4264, "step": 1392 }, { "epoch": 126.67878787878787, "grad_norm": 0.23798425495624542, "learning_rate": 3.6070000000000005e-05, "loss": 0.4566, "step": 1393 }, { "epoch": 126.77575757575758, "grad_norm": 0.2422553449869156, "learning_rate": 3.606e-05, "loss": 0.4078, "step": 1394 }, { "epoch": 126.87272727272727, "grad_norm": 0.29934629797935486, "learning_rate": 3.605e-05, "loss": 0.3834, "step": 1395 }, { "epoch": 126.96969696969697, "grad_norm": 0.337470680475235, "learning_rate": 3.604e-05, "loss": 0.4824, "step": 1396 }, { "epoch": 127.0, "grad_norm": 0.44956621527671814, "learning_rate": 3.6030000000000006e-05, "loss": 0.428, "step": 1397 }, { "epoch": 127.0969696969697, "grad_norm": 0.2476656585931778, "learning_rate": 3.6020000000000004e-05, "loss": 0.3799, "step": 1398 }, { "epoch": 127.19393939393939, "grad_norm": 0.3032231628894806, "learning_rate": 3.601e-05, "loss": 0.4688, "step": 1399 }, { "epoch": 127.2909090909091, "grad_norm": 0.21109527349472046, "learning_rate": 3.6e-05, "loss": 0.3841, "step": 1400 }, { "epoch": 127.2909090909091, "eval_loss": 0.4930807948112488, "eval_runtime": 2.161, "eval_samples_per_second": 25.451, "eval_steps_per_second": 3.239, "step": 1400 }, { "epoch": 127.38787878787879, "grad_norm": 0.3381328284740448, "learning_rate": 3.599e-05, "loss": 0.4337, "step": 1401 }, { "epoch": 127.48484848484848, "grad_norm": 0.2521904408931732, "learning_rate": 3.5980000000000004e-05, "loss": 0.4433, "step": 1402 }, { "epoch": 127.58181818181818, "grad_norm": 0.3311968743801117, "learning_rate": 3.597e-05, "loss": 0.4591, "step": 1403 }, { "epoch": 127.67878787878787, "grad_norm": 0.2518157660961151, "learning_rate": 3.596e-05, "loss": 0.4364, "step": 1404 }, { "epoch": 127.77575757575758, "grad_norm": 0.27200594544410706, "learning_rate": 3.595e-05, "loss": 0.4533, "step": 1405 }, { "epoch": 127.87272727272727, "grad_norm": 0.27561336755752563, "learning_rate": 3.594e-05, "loss": 0.4142, "step": 1406 }, { "epoch": 127.96969696969697, "grad_norm": 0.3234183192253113, "learning_rate": 3.593e-05, "loss": 0.4772, "step": 1407 }, { "epoch": 128.0, "grad_norm": 0.5238689184188843, "learning_rate": 3.592e-05, "loss": 0.5281, "step": 1408 }, { "epoch": 128.0969696969697, "grad_norm": 0.26986992359161377, "learning_rate": 3.591e-05, "loss": 0.4498, "step": 1409 }, { "epoch": 128.1939393939394, "grad_norm": 0.23676660656929016, "learning_rate": 3.59e-05, "loss": 0.46, "step": 1410 }, { "epoch": 128.1939393939394, "eval_loss": 0.4915180802345276, "eval_runtime": 2.1483, "eval_samples_per_second": 25.601, "eval_steps_per_second": 3.258, "step": 1410 }, { "epoch": 128.29090909090908, "grad_norm": 0.2594098150730133, "learning_rate": 3.5890000000000004e-05, "loss": 0.4418, "step": 1411 }, { "epoch": 128.38787878787878, "grad_norm": 0.3071761131286621, "learning_rate": 3.588e-05, "loss": 0.4297, "step": 1412 }, { "epoch": 128.4848484848485, "grad_norm": 0.2817282974720001, "learning_rate": 3.587e-05, "loss": 0.4191, "step": 1413 }, { "epoch": 128.5818181818182, "grad_norm": 0.3153073787689209, "learning_rate": 3.586e-05, "loss": 0.4725, "step": 1414 }, { "epoch": 128.6787878787879, "grad_norm": 0.27557572722435, "learning_rate": 3.585e-05, "loss": 0.4442, "step": 1415 }, { "epoch": 128.77575757575758, "grad_norm": 0.3079877495765686, "learning_rate": 3.584e-05, "loss": 0.3941, "step": 1416 }, { "epoch": 128.87272727272727, "grad_norm": 0.2863088548183441, "learning_rate": 3.583e-05, "loss": 0.4048, "step": 1417 }, { "epoch": 128.96969696969697, "grad_norm": 0.27652451395988464, "learning_rate": 3.5820000000000006e-05, "loss": 0.4715, "step": 1418 }, { "epoch": 129.0, "grad_norm": 0.45896708965301514, "learning_rate": 3.581e-05, "loss": 0.3892, "step": 1419 }, { "epoch": 129.0969696969697, "grad_norm": 0.3001515567302704, "learning_rate": 3.58e-05, "loss": 0.4649, "step": 1420 }, { "epoch": 129.0969696969697, "eval_loss": 0.4916605055332184, "eval_runtime": 2.1561, "eval_samples_per_second": 25.51, "eval_steps_per_second": 3.247, "step": 1420 }, { "epoch": 129.1939393939394, "grad_norm": 0.28979921340942383, "learning_rate": 3.579e-05, "loss": 0.4756, "step": 1421 }, { "epoch": 129.29090909090908, "grad_norm": 0.22518810629844666, "learning_rate": 3.578e-05, "loss": 0.3911, "step": 1422 }, { "epoch": 129.38787878787878, "grad_norm": 0.33359140157699585, "learning_rate": 3.5770000000000005e-05, "loss": 0.4618, "step": 1423 }, { "epoch": 129.4848484848485, "grad_norm": 0.26979026198387146, "learning_rate": 3.5759999999999996e-05, "loss": 0.4277, "step": 1424 }, { "epoch": 129.5818181818182, "grad_norm": 0.3089786767959595, "learning_rate": 3.575e-05, "loss": 0.4067, "step": 1425 }, { "epoch": 129.6787878787879, "grad_norm": 0.2348090261220932, "learning_rate": 3.574e-05, "loss": 0.4446, "step": 1426 }, { "epoch": 129.77575757575758, "grad_norm": 0.24655956029891968, "learning_rate": 3.5730000000000005e-05, "loss": 0.425, "step": 1427 }, { "epoch": 129.87272727272727, "grad_norm": 0.23908300697803497, "learning_rate": 3.5720000000000004e-05, "loss": 0.4311, "step": 1428 }, { "epoch": 129.96969696969697, "grad_norm": 0.33590126037597656, "learning_rate": 3.571e-05, "loss": 0.4437, "step": 1429 }, { "epoch": 130.0, "grad_norm": 0.5856208801269531, "learning_rate": 3.57e-05, "loss": 0.3959, "step": 1430 }, { "epoch": 130.0, "eval_loss": 0.49039074778556824, "eval_runtime": 2.1368, "eval_samples_per_second": 25.74, "eval_steps_per_second": 3.276, "step": 1430 }, { "epoch": 130.0969696969697, "grad_norm": 0.2495311051607132, "learning_rate": 3.569e-05, "loss": 0.4326, "step": 1431 }, { "epoch": 130.1939393939394, "grad_norm": 0.23713161051273346, "learning_rate": 3.5680000000000004e-05, "loss": 0.4203, "step": 1432 }, { "epoch": 130.29090909090908, "grad_norm": 0.2517852187156677, "learning_rate": 3.567e-05, "loss": 0.4397, "step": 1433 }, { "epoch": 130.38787878787878, "grad_norm": 0.3342297673225403, "learning_rate": 3.566e-05, "loss": 0.4514, "step": 1434 }, { "epoch": 130.4848484848485, "grad_norm": 0.27150875329971313, "learning_rate": 3.565e-05, "loss": 0.4163, "step": 1435 }, { "epoch": 130.5818181818182, "grad_norm": 0.3103017807006836, "learning_rate": 3.5640000000000004e-05, "loss": 0.4216, "step": 1436 }, { "epoch": 130.6787878787879, "grad_norm": 0.3338560461997986, "learning_rate": 3.563e-05, "loss": 0.4755, "step": 1437 }, { "epoch": 130.77575757575758, "grad_norm": 0.3014073967933655, "learning_rate": 3.562e-05, "loss": 0.4329, "step": 1438 }, { "epoch": 130.87272727272727, "grad_norm": 0.2600906491279602, "learning_rate": 3.5610000000000006e-05, "loss": 0.4158, "step": 1439 }, { "epoch": 130.96969696969697, "grad_norm": 0.2621995210647583, "learning_rate": 3.56e-05, "loss": 0.4183, "step": 1440 }, { "epoch": 130.96969696969697, "eval_loss": 0.48934677243232727, "eval_runtime": 2.1635, "eval_samples_per_second": 25.421, "eval_steps_per_second": 3.235, "step": 1440 }, { "epoch": 131.0, "grad_norm": 0.4798804521560669, "learning_rate": 3.559e-05, "loss": 0.5173, "step": 1441 }, { "epoch": 131.0969696969697, "grad_norm": 0.3138405382633209, "learning_rate": 3.558e-05, "loss": 0.4675, "step": 1442 }, { "epoch": 131.1939393939394, "grad_norm": 0.27567216753959656, "learning_rate": 3.557e-05, "loss": 0.465, "step": 1443 }, { "epoch": 131.29090909090908, "grad_norm": 0.308773934841156, "learning_rate": 3.5560000000000005e-05, "loss": 0.4491, "step": 1444 }, { "epoch": 131.38787878787878, "grad_norm": 0.3976498544216156, "learning_rate": 3.555e-05, "loss": 0.4124, "step": 1445 }, { "epoch": 131.4848484848485, "grad_norm": 0.22643493115901947, "learning_rate": 3.554e-05, "loss": 0.4143, "step": 1446 }, { "epoch": 131.5818181818182, "grad_norm": 0.2682403326034546, "learning_rate": 3.553e-05, "loss": 0.444, "step": 1447 }, { "epoch": 131.6787878787879, "grad_norm": 0.249359592795372, "learning_rate": 3.5520000000000006e-05, "loss": 0.4481, "step": 1448 }, { "epoch": 131.77575757575758, "grad_norm": 0.31316664814949036, "learning_rate": 3.5510000000000004e-05, "loss": 0.4161, "step": 1449 }, { "epoch": 131.87272727272727, "grad_norm": 0.3224125802516937, "learning_rate": 3.55e-05, "loss": 0.4131, "step": 1450 }, { "epoch": 131.87272727272727, "eval_loss": 0.48860692977905273, "eval_runtime": 2.166, "eval_samples_per_second": 25.392, "eval_steps_per_second": 3.232, "step": 1450 }, { "epoch": 131.96969696969697, "grad_norm": 0.2814408540725708, "learning_rate": 3.549e-05, "loss": 0.4149, "step": 1451 }, { "epoch": 132.0, "grad_norm": 0.46793943643569946, "learning_rate": 3.548e-05, "loss": 0.4296, "step": 1452 }, { "epoch": 132.0969696969697, "grad_norm": 0.24512231349945068, "learning_rate": 3.5470000000000004e-05, "loss": 0.4516, "step": 1453 }, { "epoch": 132.1939393939394, "grad_norm": 0.2817038893699646, "learning_rate": 3.546e-05, "loss": 0.4837, "step": 1454 }, { "epoch": 132.29090909090908, "grad_norm": 0.2967666983604431, "learning_rate": 3.545e-05, "loss": 0.3845, "step": 1455 }, { "epoch": 132.38787878787878, "grad_norm": 0.28917327523231506, "learning_rate": 3.544e-05, "loss": 0.467, "step": 1456 }, { "epoch": 132.4848484848485, "grad_norm": 0.2414083033800125, "learning_rate": 3.5430000000000005e-05, "loss": 0.4025, "step": 1457 }, { "epoch": 132.5818181818182, "grad_norm": 0.3054693639278412, "learning_rate": 3.542e-05, "loss": 0.4138, "step": 1458 }, { "epoch": 132.6787878787879, "grad_norm": 0.2707059681415558, "learning_rate": 3.541e-05, "loss": 0.443, "step": 1459 }, { "epoch": 132.77575757575758, "grad_norm": 0.3007029592990875, "learning_rate": 3.54e-05, "loss": 0.4541, "step": 1460 }, { "epoch": 132.77575757575758, "eval_loss": 0.4879237115383148, "eval_runtime": 2.1713, "eval_samples_per_second": 25.331, "eval_steps_per_second": 3.224, "step": 1460 }, { "epoch": 132.87272727272727, "grad_norm": 0.27469393610954285, "learning_rate": 3.539e-05, "loss": 0.4028, "step": 1461 }, { "epoch": 132.96969696969697, "grad_norm": 0.2680533528327942, "learning_rate": 3.5380000000000003e-05, "loss": 0.4265, "step": 1462 }, { "epoch": 133.0, "grad_norm": 0.46730121970176697, "learning_rate": 3.537e-05, "loss": 0.4283, "step": 1463 }, { "epoch": 133.0969696969697, "grad_norm": 0.29606109857559204, "learning_rate": 3.536000000000001e-05, "loss": 0.471, "step": 1464 }, { "epoch": 133.1939393939394, "grad_norm": 0.2748776078224182, "learning_rate": 3.535e-05, "loss": 0.4334, "step": 1465 }, { "epoch": 133.29090909090908, "grad_norm": 0.2659863233566284, "learning_rate": 3.5340000000000004e-05, "loss": 0.4235, "step": 1466 }, { "epoch": 133.38787878787878, "grad_norm": 0.23002707958221436, "learning_rate": 3.533e-05, "loss": 0.4527, "step": 1467 }, { "epoch": 133.4848484848485, "grad_norm": 0.252360075712204, "learning_rate": 3.532e-05, "loss": 0.4203, "step": 1468 }, { "epoch": 133.5818181818182, "grad_norm": 0.2285648137331009, "learning_rate": 3.5310000000000006e-05, "loss": 0.4021, "step": 1469 }, { "epoch": 133.6787878787879, "grad_norm": 0.308756947517395, "learning_rate": 3.53e-05, "loss": 0.3883, "step": 1470 }, { "epoch": 133.6787878787879, "eval_loss": 0.4869600832462311, "eval_runtime": 2.1755, "eval_samples_per_second": 25.281, "eval_steps_per_second": 3.218, "step": 1470 }, { "epoch": 133.77575757575758, "grad_norm": 0.36151477694511414, "learning_rate": 3.529e-05, "loss": 0.4449, "step": 1471 }, { "epoch": 133.87272727272727, "grad_norm": 0.27611294388771057, "learning_rate": 3.528e-05, "loss": 0.4614, "step": 1472 }, { "epoch": 133.96969696969697, "grad_norm": 0.32699742913246155, "learning_rate": 3.5270000000000006e-05, "loss": 0.42, "step": 1473 }, { "epoch": 134.0, "grad_norm": 0.6019296050071716, "learning_rate": 3.5260000000000005e-05, "loss": 0.4283, "step": 1474 }, { "epoch": 134.0969696969697, "grad_norm": 0.279818058013916, "learning_rate": 3.525e-05, "loss": 0.3954, "step": 1475 }, { "epoch": 134.1939393939394, "grad_norm": 0.269252747297287, "learning_rate": 3.524e-05, "loss": 0.3926, "step": 1476 }, { "epoch": 134.29090909090908, "grad_norm": 0.2596481144428253, "learning_rate": 3.523e-05, "loss": 0.4227, "step": 1477 }, { "epoch": 134.38787878787878, "grad_norm": 0.30212676525115967, "learning_rate": 3.5220000000000005e-05, "loss": 0.4182, "step": 1478 }, { "epoch": 134.4848484848485, "grad_norm": 0.3808242678642273, "learning_rate": 3.5210000000000003e-05, "loss": 0.4378, "step": 1479 }, { "epoch": 134.5818181818182, "grad_norm": 0.28346386551856995, "learning_rate": 3.52e-05, "loss": 0.4492, "step": 1480 }, { "epoch": 134.5818181818182, "eval_loss": 0.48657190799713135, "eval_runtime": 2.1723, "eval_samples_per_second": 25.319, "eval_steps_per_second": 3.222, "step": 1480 }, { "epoch": 134.6787878787879, "grad_norm": 0.3859547972679138, "learning_rate": 3.519e-05, "loss": 0.4525, "step": 1481 }, { "epoch": 134.77575757575758, "grad_norm": 0.2632039487361908, "learning_rate": 3.518e-05, "loss": 0.4463, "step": 1482 }, { "epoch": 134.87272727272727, "grad_norm": 0.256745308637619, "learning_rate": 3.5170000000000004e-05, "loss": 0.4196, "step": 1483 }, { "epoch": 134.96969696969697, "grad_norm": 0.3404027223587036, "learning_rate": 3.516e-05, "loss": 0.4425, "step": 1484 }, { "epoch": 135.0, "grad_norm": 0.45514726638793945, "learning_rate": 3.515e-05, "loss": 0.529, "step": 1485 }, { "epoch": 135.0969696969697, "grad_norm": 0.2929936945438385, "learning_rate": 3.514e-05, "loss": 0.4281, "step": 1486 }, { "epoch": 135.1939393939394, "grad_norm": 0.3744279444217682, "learning_rate": 3.5130000000000004e-05, "loss": 0.3906, "step": 1487 }, { "epoch": 135.29090909090908, "grad_norm": 0.30043625831604004, "learning_rate": 3.512e-05, "loss": 0.4551, "step": 1488 }, { "epoch": 135.38787878787878, "grad_norm": 0.23369665443897247, "learning_rate": 3.511e-05, "loss": 0.4127, "step": 1489 }, { "epoch": 135.4848484848485, "grad_norm": 0.32619598507881165, "learning_rate": 3.51e-05, "loss": 0.426, "step": 1490 }, { "epoch": 135.4848484848485, "eval_loss": 0.4854676425457001, "eval_runtime": 2.136, "eval_samples_per_second": 25.749, "eval_steps_per_second": 3.277, "step": 1490 }, { "epoch": 135.5818181818182, "grad_norm": 0.24827663600444794, "learning_rate": 3.509e-05, "loss": 0.4259, "step": 1491 }, { "epoch": 135.6787878787879, "grad_norm": 0.239982470870018, "learning_rate": 3.508e-05, "loss": 0.4167, "step": 1492 }, { "epoch": 135.77575757575758, "grad_norm": 0.28403112292289734, "learning_rate": 3.507e-05, "loss": 0.4565, "step": 1493 }, { "epoch": 135.87272727272727, "grad_norm": 0.31020352244377136, "learning_rate": 3.5060000000000007e-05, "loss": 0.4335, "step": 1494 }, { "epoch": 135.96969696969697, "grad_norm": 0.2941436767578125, "learning_rate": 3.505e-05, "loss": 0.4491, "step": 1495 }, { "epoch": 136.0, "grad_norm": 0.42165759205818176, "learning_rate": 3.504e-05, "loss": 0.4343, "step": 1496 }, { "epoch": 136.0969696969697, "grad_norm": 0.2817646563053131, "learning_rate": 3.503e-05, "loss": 0.3876, "step": 1497 }, { "epoch": 136.1939393939394, "grad_norm": 0.2687471807003021, "learning_rate": 3.502e-05, "loss": 0.4572, "step": 1498 }, { "epoch": 136.29090909090908, "grad_norm": 0.2752440571784973, "learning_rate": 3.5010000000000005e-05, "loss": 0.4754, "step": 1499 }, { "epoch": 136.38787878787878, "grad_norm": 0.26806142926216125, "learning_rate": 3.5e-05, "loss": 0.4099, "step": 1500 }, { "epoch": 136.38787878787878, "eval_loss": 0.48457300662994385, "eval_runtime": 2.1595, "eval_samples_per_second": 25.469, "eval_steps_per_second": 3.242, "step": 1500 }, { "epoch": 136.4848484848485, "grad_norm": 0.26220443844795227, "learning_rate": 3.499e-05, "loss": 0.4728, "step": 1501 }, { "epoch": 136.5818181818182, "grad_norm": 0.29137399792671204, "learning_rate": 3.498e-05, "loss": 0.4426, "step": 1502 }, { "epoch": 136.6787878787879, "grad_norm": 0.28184735774993896, "learning_rate": 3.4970000000000006e-05, "loss": 0.3705, "step": 1503 }, { "epoch": 136.77575757575758, "grad_norm": 0.26435405015945435, "learning_rate": 3.4960000000000004e-05, "loss": 0.4361, "step": 1504 }, { "epoch": 136.87272727272727, "grad_norm": 0.29367130994796753, "learning_rate": 3.495e-05, "loss": 0.4451, "step": 1505 }, { "epoch": 136.96969696969697, "grad_norm": 0.35805878043174744, "learning_rate": 3.494e-05, "loss": 0.3964, "step": 1506 }, { "epoch": 137.0, "grad_norm": 0.45524048805236816, "learning_rate": 3.493e-05, "loss": 0.4054, "step": 1507 }, { "epoch": 137.0969696969697, "grad_norm": 0.26967760920524597, "learning_rate": 3.4920000000000004e-05, "loss": 0.4508, "step": 1508 }, { "epoch": 137.1939393939394, "grad_norm": 0.25061696767807007, "learning_rate": 3.491e-05, "loss": 0.4245, "step": 1509 }, { "epoch": 137.29090909090908, "grad_norm": 0.24153633415699005, "learning_rate": 3.49e-05, "loss": 0.4106, "step": 1510 }, { "epoch": 137.29090909090908, "eval_loss": 0.4839669167995453, "eval_runtime": 2.1707, "eval_samples_per_second": 25.338, "eval_steps_per_second": 3.225, "step": 1510 }, { "epoch": 137.38787878787878, "grad_norm": 0.2820562422275543, "learning_rate": 3.489e-05, "loss": 0.4411, "step": 1511 }, { "epoch": 137.4848484848485, "grad_norm": 0.26837873458862305, "learning_rate": 3.4880000000000005e-05, "loss": 0.4497, "step": 1512 }, { "epoch": 137.5818181818182, "grad_norm": 0.2681138813495636, "learning_rate": 3.487e-05, "loss": 0.4102, "step": 1513 }, { "epoch": 137.6787878787879, "grad_norm": 0.27645957469940186, "learning_rate": 3.486e-05, "loss": 0.4344, "step": 1514 }, { "epoch": 137.77575757575758, "grad_norm": 0.25688016414642334, "learning_rate": 3.485e-05, "loss": 0.4236, "step": 1515 }, { "epoch": 137.87272727272727, "grad_norm": 0.3318941593170166, "learning_rate": 3.484e-05, "loss": 0.4538, "step": 1516 }, { "epoch": 137.96969696969697, "grad_norm": 0.22465866804122925, "learning_rate": 3.4830000000000004e-05, "loss": 0.3973, "step": 1517 }, { "epoch": 138.0, "grad_norm": 0.36190637946128845, "learning_rate": 3.482e-05, "loss": 0.3488, "step": 1518 }, { "epoch": 138.0969696969697, "grad_norm": 0.2324100136756897, "learning_rate": 3.481e-05, "loss": 0.4149, "step": 1519 }, { "epoch": 138.1939393939394, "grad_norm": 0.2485891580581665, "learning_rate": 3.48e-05, "loss": 0.4321, "step": 1520 }, { "epoch": 138.1939393939394, "eval_loss": 0.4832867980003357, "eval_runtime": 2.1612, "eval_samples_per_second": 25.449, "eval_steps_per_second": 3.239, "step": 1520 }, { "epoch": 138.29090909090908, "grad_norm": 0.2955532371997833, "learning_rate": 3.479e-05, "loss": 0.4417, "step": 1521 }, { "epoch": 138.38787878787878, "grad_norm": 0.2552751302719116, "learning_rate": 3.478e-05, "loss": 0.4241, "step": 1522 }, { "epoch": 138.4848484848485, "grad_norm": 0.27413198351860046, "learning_rate": 3.477e-05, "loss": 0.4504, "step": 1523 }, { "epoch": 138.5818181818182, "grad_norm": 0.23463669419288635, "learning_rate": 3.4760000000000006e-05, "loss": 0.4139, "step": 1524 }, { "epoch": 138.6787878787879, "grad_norm": 0.2654094696044922, "learning_rate": 3.475e-05, "loss": 0.4482, "step": 1525 }, { "epoch": 138.77575757575758, "grad_norm": 0.3383115828037262, "learning_rate": 3.474e-05, "loss": 0.4053, "step": 1526 }, { "epoch": 138.87272727272727, "grad_norm": 0.3001095652580261, "learning_rate": 3.473e-05, "loss": 0.4237, "step": 1527 }, { "epoch": 138.96969696969697, "grad_norm": 0.31264686584472656, "learning_rate": 3.472e-05, "loss": 0.4034, "step": 1528 }, { "epoch": 139.0, "grad_norm": 0.500529944896698, "learning_rate": 3.4710000000000005e-05, "loss": 0.4396, "step": 1529 }, { "epoch": 139.0969696969697, "grad_norm": 0.26789966225624084, "learning_rate": 3.4699999999999996e-05, "loss": 0.4432, "step": 1530 }, { "epoch": 139.0969696969697, "eval_loss": 0.48245444893836975, "eval_runtime": 2.1444, "eval_samples_per_second": 25.648, "eval_steps_per_second": 3.264, "step": 1530 }, { "epoch": 139.1939393939394, "grad_norm": 0.29543471336364746, "learning_rate": 3.469e-05, "loss": 0.4696, "step": 1531 }, { "epoch": 139.29090909090908, "grad_norm": 0.2340238094329834, "learning_rate": 3.468e-05, "loss": 0.3868, "step": 1532 }, { "epoch": 139.38787878787878, "grad_norm": 0.2654852271080017, "learning_rate": 3.4670000000000005e-05, "loss": 0.4385, "step": 1533 }, { "epoch": 139.4848484848485, "grad_norm": 0.2928999364376068, "learning_rate": 3.4660000000000004e-05, "loss": 0.4506, "step": 1534 }, { "epoch": 139.5818181818182, "grad_norm": 0.3032337725162506, "learning_rate": 3.465e-05, "loss": 0.3848, "step": 1535 }, { "epoch": 139.6787878787879, "grad_norm": 0.340878963470459, "learning_rate": 3.464e-05, "loss": 0.3951, "step": 1536 }, { "epoch": 139.77575757575758, "grad_norm": 0.25129806995391846, "learning_rate": 3.463e-05, "loss": 0.4616, "step": 1537 }, { "epoch": 139.87272727272727, "grad_norm": 0.30390605330467224, "learning_rate": 3.4620000000000004e-05, "loss": 0.4478, "step": 1538 }, { "epoch": 139.96969696969697, "grad_norm": 0.24877336621284485, "learning_rate": 3.461e-05, "loss": 0.3916, "step": 1539 }, { "epoch": 140.0, "grad_norm": 0.4447900354862213, "learning_rate": 3.46e-05, "loss": 0.3716, "step": 1540 }, { "epoch": 140.0, "eval_loss": 0.4822714030742645, "eval_runtime": 2.1583, "eval_samples_per_second": 25.483, "eval_steps_per_second": 3.243, "step": 1540 }, { "epoch": 140.0969696969697, "grad_norm": 0.31359797716140747, "learning_rate": 3.459e-05, "loss": 0.441, "step": 1541 }, { "epoch": 140.1939393939394, "grad_norm": 0.27479588985443115, "learning_rate": 3.4580000000000004e-05, "loss": 0.4205, "step": 1542 }, { "epoch": 140.29090909090908, "grad_norm": 0.2537696361541748, "learning_rate": 3.457e-05, "loss": 0.4271, "step": 1543 }, { "epoch": 140.38787878787878, "grad_norm": 0.30846652388572693, "learning_rate": 3.456e-05, "loss": 0.4139, "step": 1544 }, { "epoch": 140.4848484848485, "grad_norm": 0.3260568082332611, "learning_rate": 3.455e-05, "loss": 0.4388, "step": 1545 }, { "epoch": 140.5818181818182, "grad_norm": 0.27833911776542664, "learning_rate": 3.454e-05, "loss": 0.422, "step": 1546 }, { "epoch": 140.6787878787879, "grad_norm": 0.3193536698818207, "learning_rate": 3.453e-05, "loss": 0.4182, "step": 1547 }, { "epoch": 140.77575757575758, "grad_norm": 0.2909131646156311, "learning_rate": 3.452e-05, "loss": 0.418, "step": 1548 }, { "epoch": 140.87272727272727, "grad_norm": 0.2805783748626709, "learning_rate": 3.451000000000001e-05, "loss": 0.4181, "step": 1549 }, { "epoch": 140.96969696969697, "grad_norm": 0.2544325590133667, "learning_rate": 3.45e-05, "loss": 0.4336, "step": 1550 }, { "epoch": 140.96969696969697, "eval_loss": 0.4808160364627838, "eval_runtime": 2.1672, "eval_samples_per_second": 25.378, "eval_steps_per_second": 3.23, "step": 1550 }, { "epoch": 141.0, "grad_norm": 0.5855840444564819, "learning_rate": 3.449e-05, "loss": 0.4099, "step": 1551 }, { "epoch": 141.0969696969697, "grad_norm": 0.2724772095680237, "learning_rate": 3.448e-05, "loss": 0.4489, "step": 1552 }, { "epoch": 141.1939393939394, "grad_norm": 0.2569520175457001, "learning_rate": 3.447e-05, "loss": 0.4341, "step": 1553 }, { "epoch": 141.29090909090908, "grad_norm": 0.24317876994609833, "learning_rate": 3.4460000000000005e-05, "loss": 0.4252, "step": 1554 }, { "epoch": 141.38787878787878, "grad_norm": 0.24204610288143158, "learning_rate": 3.445e-05, "loss": 0.3995, "step": 1555 }, { "epoch": 141.4848484848485, "grad_norm": 0.2994146943092346, "learning_rate": 3.444e-05, "loss": 0.4007, "step": 1556 }, { "epoch": 141.5818181818182, "grad_norm": 0.2774331569671631, "learning_rate": 3.443e-05, "loss": 0.4026, "step": 1557 }, { "epoch": 141.6787878787879, "grad_norm": 0.267837792634964, "learning_rate": 3.442e-05, "loss": 0.3982, "step": 1558 }, { "epoch": 141.77575757575758, "grad_norm": 0.3514941334724426, "learning_rate": 3.4410000000000004e-05, "loss": 0.4719, "step": 1559 }, { "epoch": 141.87272727272727, "grad_norm": 0.3003464937210083, "learning_rate": 3.4399999999999996e-05, "loss": 0.4297, "step": 1560 }, { "epoch": 141.87272727272727, "eval_loss": 0.4802820384502411, "eval_runtime": 2.1484, "eval_samples_per_second": 25.6, "eval_steps_per_second": 3.258, "step": 1560 }, { "epoch": 141.96969696969697, "grad_norm": 0.25958773493766785, "learning_rate": 3.439e-05, "loss": 0.4322, "step": 1561 }, { "epoch": 142.0, "grad_norm": 0.45551276206970215, "learning_rate": 3.438e-05, "loss": 0.388, "step": 1562 }, { "epoch": 142.0969696969697, "grad_norm": 0.22718825936317444, "learning_rate": 3.4370000000000005e-05, "loss": 0.3969, "step": 1563 }, { "epoch": 142.1939393939394, "grad_norm": 0.2647274434566498, "learning_rate": 3.436e-05, "loss": 0.4392, "step": 1564 }, { "epoch": 142.29090909090908, "grad_norm": 0.2525234520435333, "learning_rate": 3.435e-05, "loss": 0.4372, "step": 1565 }, { "epoch": 142.38787878787878, "grad_norm": 0.27025994658470154, "learning_rate": 3.434e-05, "loss": 0.4349, "step": 1566 }, { "epoch": 142.4848484848485, "grad_norm": 0.33979251980781555, "learning_rate": 3.433e-05, "loss": 0.4484, "step": 1567 }, { "epoch": 142.5818181818182, "grad_norm": 0.28829941153526306, "learning_rate": 3.4320000000000003e-05, "loss": 0.4131, "step": 1568 }, { "epoch": 142.6787878787879, "grad_norm": 0.29526832699775696, "learning_rate": 3.431e-05, "loss": 0.4117, "step": 1569 }, { "epoch": 142.77575757575758, "grad_norm": 0.3364526331424713, "learning_rate": 3.430000000000001e-05, "loss": 0.4403, "step": 1570 }, { "epoch": 142.77575757575758, "eval_loss": 0.47912371158599854, "eval_runtime": 2.1728, "eval_samples_per_second": 25.313, "eval_steps_per_second": 3.222, "step": 1570 }, { "epoch": 142.87272727272727, "grad_norm": 0.27434784173965454, "learning_rate": 3.429e-05, "loss": 0.3931, "step": 1571 }, { "epoch": 142.96969696969697, "grad_norm": 0.2859874963760376, "learning_rate": 3.4280000000000004e-05, "loss": 0.4199, "step": 1572 }, { "epoch": 143.0, "grad_norm": 0.49399182200431824, "learning_rate": 3.427e-05, "loss": 0.3853, "step": 1573 }, { "epoch": 143.0969696969697, "grad_norm": 0.2726399898529053, "learning_rate": 3.426e-05, "loss": 0.4038, "step": 1574 }, { "epoch": 143.1939393939394, "grad_norm": 0.32016149163246155, "learning_rate": 3.4250000000000006e-05, "loss": 0.4222, "step": 1575 }, { "epoch": 143.29090909090908, "grad_norm": 0.32801294326782227, "learning_rate": 3.424e-05, "loss": 0.4029, "step": 1576 }, { "epoch": 143.38787878787878, "grad_norm": 0.3783630132675171, "learning_rate": 3.423e-05, "loss": 0.4632, "step": 1577 }, { "epoch": 143.4848484848485, "grad_norm": 0.503115177154541, "learning_rate": 3.422e-05, "loss": 0.391, "step": 1578 }, { "epoch": 143.5818181818182, "grad_norm": 0.3633951246738434, "learning_rate": 3.4210000000000006e-05, "loss": 0.4029, "step": 1579 }, { "epoch": 143.6787878787879, "grad_norm": 0.21935221552848816, "learning_rate": 3.4200000000000005e-05, "loss": 0.3888, "step": 1580 }, { "epoch": 143.6787878787879, "eval_loss": 0.4786372780799866, "eval_runtime": 2.1746, "eval_samples_per_second": 25.292, "eval_steps_per_second": 3.219, "step": 1580 }, { "epoch": 143.77575757575758, "grad_norm": 0.30795830488204956, "learning_rate": 3.419e-05, "loss": 0.4336, "step": 1581 }, { "epoch": 143.87272727272727, "grad_norm": 0.27631527185440063, "learning_rate": 3.418e-05, "loss": 0.4377, "step": 1582 }, { "epoch": 143.96969696969697, "grad_norm": 0.32671844959259033, "learning_rate": 3.417e-05, "loss": 0.4654, "step": 1583 }, { "epoch": 144.0, "grad_norm": 0.5490805506706238, "learning_rate": 3.4160000000000005e-05, "loss": 0.4344, "step": 1584 }, { "epoch": 144.0969696969697, "grad_norm": 0.31319963932037354, "learning_rate": 3.415e-05, "loss": 0.4453, "step": 1585 }, { "epoch": 144.1939393939394, "grad_norm": 0.2572191059589386, "learning_rate": 3.414e-05, "loss": 0.4076, "step": 1586 }, { "epoch": 144.29090909090908, "grad_norm": 0.2999926507472992, "learning_rate": 3.413e-05, "loss": 0.438, "step": 1587 }, { "epoch": 144.38787878787878, "grad_norm": 0.29962679743766785, "learning_rate": 3.412e-05, "loss": 0.4267, "step": 1588 }, { "epoch": 144.4848484848485, "grad_norm": 0.28824809193611145, "learning_rate": 3.4110000000000004e-05, "loss": 0.4185, "step": 1589 }, { "epoch": 144.5818181818182, "grad_norm": 0.3261265456676483, "learning_rate": 3.41e-05, "loss": 0.4141, "step": 1590 }, { "epoch": 144.5818181818182, "eval_loss": 0.47886165976524353, "eval_runtime": 2.1489, "eval_samples_per_second": 25.595, "eval_steps_per_second": 3.258, "step": 1590 }, { "epoch": 144.6787878787879, "grad_norm": 0.3977707028388977, "learning_rate": 3.409e-05, "loss": 0.4051, "step": 1591 }, { "epoch": 144.77575757575758, "grad_norm": 0.3052314221858978, "learning_rate": 3.408e-05, "loss": 0.3797, "step": 1592 }, { "epoch": 144.87272727272727, "grad_norm": 0.2947228252887726, "learning_rate": 3.4070000000000004e-05, "loss": 0.4332, "step": 1593 }, { "epoch": 144.96969696969697, "grad_norm": 0.2849595844745636, "learning_rate": 3.406e-05, "loss": 0.4327, "step": 1594 }, { "epoch": 145.0, "grad_norm": 0.45358598232269287, "learning_rate": 3.405e-05, "loss": 0.4358, "step": 1595 }, { "epoch": 145.0969696969697, "grad_norm": 0.30770865082740784, "learning_rate": 3.404e-05, "loss": 0.3994, "step": 1596 }, { "epoch": 145.1939393939394, "grad_norm": 0.4116463363170624, "learning_rate": 3.403e-05, "loss": 0.408, "step": 1597 }, { "epoch": 145.29090909090908, "grad_norm": 0.3829961121082306, "learning_rate": 3.402e-05, "loss": 0.4197, "step": 1598 }, { "epoch": 145.38787878787878, "grad_norm": 0.3223172128200531, "learning_rate": 3.401e-05, "loss": 0.4271, "step": 1599 }, { "epoch": 145.4848484848485, "grad_norm": 0.29979923367500305, "learning_rate": 3.4000000000000007e-05, "loss": 0.4279, "step": 1600 }, { "epoch": 145.4848484848485, "eval_loss": 0.47824418544769287, "eval_runtime": 2.1476, "eval_samples_per_second": 25.61, "eval_steps_per_second": 3.259, "step": 1600 }, { "epoch": 145.5818181818182, "grad_norm": 0.29526492953300476, "learning_rate": 3.399e-05, "loss": 0.4245, "step": 1601 }, { "epoch": 145.6787878787879, "grad_norm": 0.39535799622535706, "learning_rate": 3.398e-05, "loss": 0.4542, "step": 1602 }, { "epoch": 145.77575757575758, "grad_norm": 0.3696836531162262, "learning_rate": 3.397e-05, "loss": 0.4573, "step": 1603 }, { "epoch": 145.87272727272727, "grad_norm": 0.2670351564884186, "learning_rate": 3.396e-05, "loss": 0.3654, "step": 1604 }, { "epoch": 145.96969696969697, "grad_norm": 0.2809720039367676, "learning_rate": 3.3950000000000005e-05, "loss": 0.4188, "step": 1605 }, { "epoch": 146.0, "grad_norm": 0.491420179605484, "learning_rate": 3.394e-05, "loss": 0.4057, "step": 1606 }, { "epoch": 146.0969696969697, "grad_norm": 0.22479571402072906, "learning_rate": 3.393e-05, "loss": 0.4385, "step": 1607 }, { "epoch": 146.1939393939394, "grad_norm": 0.3089130222797394, "learning_rate": 3.392e-05, "loss": 0.42, "step": 1608 }, { "epoch": 146.29090909090908, "grad_norm": 0.3012380301952362, "learning_rate": 3.3910000000000006e-05, "loss": 0.4482, "step": 1609 }, { "epoch": 146.38787878787878, "grad_norm": 0.2956259846687317, "learning_rate": 3.3900000000000004e-05, "loss": 0.4254, "step": 1610 }, { "epoch": 146.38787878787878, "eval_loss": 0.47690457105636597, "eval_runtime": 2.1714, "eval_samples_per_second": 25.329, "eval_steps_per_second": 3.224, "step": 1610 }, { "epoch": 146.4848484848485, "grad_norm": 0.2577868103981018, "learning_rate": 3.389e-05, "loss": 0.378, "step": 1611 }, { "epoch": 146.5818181818182, "grad_norm": 0.28192633390426636, "learning_rate": 3.388e-05, "loss": 0.3782, "step": 1612 }, { "epoch": 146.6787878787879, "grad_norm": 0.2914888262748718, "learning_rate": 3.387e-05, "loss": 0.4122, "step": 1613 }, { "epoch": 146.77575757575758, "grad_norm": 0.3398047983646393, "learning_rate": 3.3860000000000004e-05, "loss": 0.4108, "step": 1614 }, { "epoch": 146.87272727272727, "grad_norm": 0.32673320174217224, "learning_rate": 3.385e-05, "loss": 0.46, "step": 1615 }, { "epoch": 146.96969696969697, "grad_norm": 0.29334592819213867, "learning_rate": 3.384e-05, "loss": 0.4148, "step": 1616 }, { "epoch": 147.0, "grad_norm": 0.43029725551605225, "learning_rate": 3.383e-05, "loss": 0.4231, "step": 1617 }, { "epoch": 147.0969696969697, "grad_norm": 0.27203699946403503, "learning_rate": 3.3820000000000005e-05, "loss": 0.4093, "step": 1618 }, { "epoch": 147.1939393939394, "grad_norm": 0.3553725779056549, "learning_rate": 3.381e-05, "loss": 0.4163, "step": 1619 }, { "epoch": 147.29090909090908, "grad_norm": 0.30790579319000244, "learning_rate": 3.38e-05, "loss": 0.4504, "step": 1620 }, { "epoch": 147.29090909090908, "eval_loss": 0.4763954281806946, "eval_runtime": 2.1544, "eval_samples_per_second": 25.529, "eval_steps_per_second": 3.249, "step": 1620 }, { "epoch": 147.38787878787878, "grad_norm": 0.29896020889282227, "learning_rate": 3.379e-05, "loss": 0.4409, "step": 1621 }, { "epoch": 147.4848484848485, "grad_norm": 0.300333172082901, "learning_rate": 3.378e-05, "loss": 0.4207, "step": 1622 }, { "epoch": 147.5818181818182, "grad_norm": 0.2776268422603607, "learning_rate": 3.3770000000000004e-05, "loss": 0.4463, "step": 1623 }, { "epoch": 147.6787878787879, "grad_norm": 0.3045647442340851, "learning_rate": 3.376e-05, "loss": 0.3936, "step": 1624 }, { "epoch": 147.77575757575758, "grad_norm": 0.2653445601463318, "learning_rate": 3.375000000000001e-05, "loss": 0.3902, "step": 1625 }, { "epoch": 147.87272727272727, "grad_norm": 0.26968562602996826, "learning_rate": 3.374e-05, "loss": 0.4058, "step": 1626 }, { "epoch": 147.96969696969697, "grad_norm": 0.2962372601032257, "learning_rate": 3.373e-05, "loss": 0.4248, "step": 1627 }, { "epoch": 148.0, "grad_norm": 0.42462968826293945, "learning_rate": 3.372e-05, "loss": 0.3493, "step": 1628 }, { "epoch": 148.0969696969697, "grad_norm": 0.2580646574497223, "learning_rate": 3.371e-05, "loss": 0.3995, "step": 1629 }, { "epoch": 148.1939393939394, "grad_norm": 0.2709900438785553, "learning_rate": 3.3700000000000006e-05, "loss": 0.4414, "step": 1630 }, { "epoch": 148.1939393939394, "eval_loss": 0.476203054189682, "eval_runtime": 2.1468, "eval_samples_per_second": 25.62, "eval_steps_per_second": 3.261, "step": 1630 }, { "epoch": 148.29090909090908, "grad_norm": 0.24366122484207153, "learning_rate": 3.369e-05, "loss": 0.4154, "step": 1631 }, { "epoch": 148.38787878787878, "grad_norm": 0.29152488708496094, "learning_rate": 3.368e-05, "loss": 0.375, "step": 1632 }, { "epoch": 148.4848484848485, "grad_norm": 0.285420298576355, "learning_rate": 3.367e-05, "loss": 0.43, "step": 1633 }, { "epoch": 148.5818181818182, "grad_norm": 0.29472947120666504, "learning_rate": 3.366e-05, "loss": 0.4215, "step": 1634 }, { "epoch": 148.6787878787879, "grad_norm": 0.45701467990875244, "learning_rate": 3.3650000000000005e-05, "loss": 0.3847, "step": 1635 }, { "epoch": 148.77575757575758, "grad_norm": 0.2992357313632965, "learning_rate": 3.3639999999999996e-05, "loss": 0.4468, "step": 1636 }, { "epoch": 148.87272727272727, "grad_norm": 0.21454013884067535, "learning_rate": 3.363e-05, "loss": 0.3987, "step": 1637 }, { "epoch": 148.96969696969697, "grad_norm": 0.2769618034362793, "learning_rate": 3.362e-05, "loss": 0.4434, "step": 1638 }, { "epoch": 149.0, "grad_norm": 0.5300103425979614, "learning_rate": 3.3610000000000005e-05, "loss": 0.4426, "step": 1639 }, { "epoch": 149.0969696969697, "grad_norm": 0.2771533131599426, "learning_rate": 3.3600000000000004e-05, "loss": 0.4, "step": 1640 }, { "epoch": 149.0969696969697, "eval_loss": 0.4750164747238159, "eval_runtime": 2.159, "eval_samples_per_second": 25.475, "eval_steps_per_second": 3.242, "step": 1640 }, { "epoch": 149.1939393939394, "grad_norm": 0.3044075667858124, "learning_rate": 3.359e-05, "loss": 0.3797, "step": 1641 }, { "epoch": 149.29090909090908, "grad_norm": 0.2780078053474426, "learning_rate": 3.358e-05, "loss": 0.4281, "step": 1642 }, { "epoch": 149.38787878787878, "grad_norm": 0.27099543809890747, "learning_rate": 3.357e-05, "loss": 0.4213, "step": 1643 }, { "epoch": 149.4848484848485, "grad_norm": 0.30960965156555176, "learning_rate": 3.3560000000000004e-05, "loss": 0.4088, "step": 1644 }, { "epoch": 149.5818181818182, "grad_norm": 0.29440176486968994, "learning_rate": 3.355e-05, "loss": 0.4539, "step": 1645 }, { "epoch": 149.6787878787879, "grad_norm": 0.31968364119529724, "learning_rate": 3.354e-05, "loss": 0.3991, "step": 1646 }, { "epoch": 149.77575757575758, "grad_norm": 0.2963279187679291, "learning_rate": 3.353e-05, "loss": 0.4245, "step": 1647 }, { "epoch": 149.87272727272727, "grad_norm": 0.26230543851852417, "learning_rate": 3.3520000000000004e-05, "loss": 0.4083, "step": 1648 }, { "epoch": 149.96969696969697, "grad_norm": 0.30306246876716614, "learning_rate": 3.351e-05, "loss": 0.4338, "step": 1649 }, { "epoch": 150.0, "grad_norm": 0.4665185511112213, "learning_rate": 3.35e-05, "loss": 0.4254, "step": 1650 }, { "epoch": 150.0, "eval_loss": 0.475295752286911, "eval_runtime": 2.1709, "eval_samples_per_second": 25.335, "eval_steps_per_second": 3.225, "step": 1650 }, { "epoch": 150.0969696969697, "grad_norm": 0.3292195796966553, "learning_rate": 3.349e-05, "loss": 0.3975, "step": 1651 }, { "epoch": 150.1939393939394, "grad_norm": 0.25012484192848206, "learning_rate": 3.348e-05, "loss": 0.378, "step": 1652 }, { "epoch": 150.29090909090908, "grad_norm": 0.31056174635887146, "learning_rate": 3.347e-05, "loss": 0.445, "step": 1653 }, { "epoch": 150.38787878787878, "grad_norm": 0.3301188051700592, "learning_rate": 3.346e-05, "loss": 0.4353, "step": 1654 }, { "epoch": 150.4848484848485, "grad_norm": 0.3136032819747925, "learning_rate": 3.345000000000001e-05, "loss": 0.4452, "step": 1655 }, { "epoch": 150.5818181818182, "grad_norm": 0.31803566217422485, "learning_rate": 3.344e-05, "loss": 0.3753, "step": 1656 }, { "epoch": 150.6787878787879, "grad_norm": 0.28205081820487976, "learning_rate": 3.3430000000000003e-05, "loss": 0.4514, "step": 1657 }, { "epoch": 150.77575757575758, "grad_norm": 0.27556318044662476, "learning_rate": 3.342e-05, "loss": 0.4138, "step": 1658 }, { "epoch": 150.87272727272727, "grad_norm": 0.25817060470581055, "learning_rate": 3.341e-05, "loss": 0.3995, "step": 1659 }, { "epoch": 150.96969696969697, "grad_norm": 0.35275375843048096, "learning_rate": 3.3400000000000005e-05, "loss": 0.4112, "step": 1660 }, { "epoch": 150.96969696969697, "eval_loss": 0.4743495285511017, "eval_runtime": 2.1628, "eval_samples_per_second": 25.43, "eval_steps_per_second": 3.237, "step": 1660 }, { "epoch": 151.0, "grad_norm": 0.4107286334037781, "learning_rate": 3.339e-05, "loss": 0.3963, "step": 1661 }, { "epoch": 151.0969696969697, "grad_norm": 0.3202429711818695, "learning_rate": 3.338e-05, "loss": 0.456, "step": 1662 }, { "epoch": 151.1939393939394, "grad_norm": 0.2752574384212494, "learning_rate": 3.337e-05, "loss": 0.3696, "step": 1663 }, { "epoch": 151.29090909090908, "grad_norm": 0.24195246398448944, "learning_rate": 3.336e-05, "loss": 0.3814, "step": 1664 }, { "epoch": 151.38787878787878, "grad_norm": 0.2573438584804535, "learning_rate": 3.3350000000000004e-05, "loss": 0.4085, "step": 1665 }, { "epoch": 151.4848484848485, "grad_norm": 0.29553788900375366, "learning_rate": 3.3339999999999996e-05, "loss": 0.4757, "step": 1666 }, { "epoch": 151.5818181818182, "grad_norm": 0.29257678985595703, "learning_rate": 3.333e-05, "loss": 0.4003, "step": 1667 }, { "epoch": 151.6787878787879, "grad_norm": 0.28193211555480957, "learning_rate": 3.332e-05, "loss": 0.3916, "step": 1668 }, { "epoch": 151.77575757575758, "grad_norm": 0.2794460654258728, "learning_rate": 3.3310000000000005e-05, "loss": 0.3924, "step": 1669 }, { "epoch": 151.87272727272727, "grad_norm": 0.3853585124015808, "learning_rate": 3.33e-05, "loss": 0.4365, "step": 1670 }, { "epoch": 151.87272727272727, "eval_loss": 0.47319746017456055, "eval_runtime": 2.1647, "eval_samples_per_second": 25.407, "eval_steps_per_second": 3.234, "step": 1670 }, { "epoch": 151.96969696969697, "grad_norm": 0.27314481139183044, "learning_rate": 3.329e-05, "loss": 0.4107, "step": 1671 }, { "epoch": 152.0, "grad_norm": 0.4267328083515167, "learning_rate": 3.328e-05, "loss": 0.4639, "step": 1672 }, { "epoch": 152.0969696969697, "grad_norm": 0.38574451208114624, "learning_rate": 3.327e-05, "loss": 0.4079, "step": 1673 }, { "epoch": 152.1939393939394, "grad_norm": 0.29176294803619385, "learning_rate": 3.3260000000000003e-05, "loss": 0.4114, "step": 1674 }, { "epoch": 152.29090909090908, "grad_norm": 0.2615834176540375, "learning_rate": 3.325e-05, "loss": 0.422, "step": 1675 }, { "epoch": 152.38787878787878, "grad_norm": 0.2873329818248749, "learning_rate": 3.324e-05, "loss": 0.4061, "step": 1676 }, { "epoch": 152.4848484848485, "grad_norm": 0.2655068635940552, "learning_rate": 3.323e-05, "loss": 0.354, "step": 1677 }, { "epoch": 152.5818181818182, "grad_norm": 0.2940961420536041, "learning_rate": 3.3220000000000004e-05, "loss": 0.4342, "step": 1678 }, { "epoch": 152.6787878787879, "grad_norm": 0.2879832684993744, "learning_rate": 3.321e-05, "loss": 0.4436, "step": 1679 }, { "epoch": 152.77575757575758, "grad_norm": 0.2613934874534607, "learning_rate": 3.32e-05, "loss": 0.3811, "step": 1680 }, { "epoch": 152.77575757575758, "eval_loss": 0.47243139147758484, "eval_runtime": 2.1715, "eval_samples_per_second": 25.328, "eval_steps_per_second": 3.224, "step": 1680 }, { "epoch": 152.87272727272727, "grad_norm": 0.2908010482788086, "learning_rate": 3.319e-05, "loss": 0.416, "step": 1681 }, { "epoch": 152.96969696969697, "grad_norm": 0.28120896220207214, "learning_rate": 3.318e-05, "loss": 0.4437, "step": 1682 }, { "epoch": 153.0, "grad_norm": 0.46632063388824463, "learning_rate": 3.317e-05, "loss": 0.4474, "step": 1683 }, { "epoch": 153.0969696969697, "grad_norm": 0.25776177644729614, "learning_rate": 3.316e-05, "loss": 0.3701, "step": 1684 }, { "epoch": 153.1939393939394, "grad_norm": 0.3227159380912781, "learning_rate": 3.3150000000000006e-05, "loss": 0.4368, "step": 1685 }, { "epoch": 153.29090909090908, "grad_norm": 0.27070197463035583, "learning_rate": 3.314e-05, "loss": 0.3981, "step": 1686 }, { "epoch": 153.38787878787878, "grad_norm": 0.29919660091400146, "learning_rate": 3.313e-05, "loss": 0.4038, "step": 1687 }, { "epoch": 153.4848484848485, "grad_norm": 0.25041720271110535, "learning_rate": 3.312e-05, "loss": 0.3935, "step": 1688 }, { "epoch": 153.5818181818182, "grad_norm": 0.29702797532081604, "learning_rate": 3.311e-05, "loss": 0.436, "step": 1689 }, { "epoch": 153.6787878787879, "grad_norm": 0.2959851026535034, "learning_rate": 3.3100000000000005e-05, "loss": 0.4883, "step": 1690 }, { "epoch": 153.6787878787879, "eval_loss": 0.47213518619537354, "eval_runtime": 2.165, "eval_samples_per_second": 25.404, "eval_steps_per_second": 3.233, "step": 1690 }, { "epoch": 153.77575757575758, "grad_norm": 0.2581026256084442, "learning_rate": 3.309e-05, "loss": 0.4061, "step": 1691 }, { "epoch": 153.87272727272727, "grad_norm": 0.2597564458847046, "learning_rate": 3.308e-05, "loss": 0.4079, "step": 1692 }, { "epoch": 153.96969696969697, "grad_norm": 0.2366248518228531, "learning_rate": 3.307e-05, "loss": 0.3716, "step": 1693 }, { "epoch": 154.0, "grad_norm": 0.5981835722923279, "learning_rate": 3.3060000000000005e-05, "loss": 0.4349, "step": 1694 }, { "epoch": 154.0969696969697, "grad_norm": 0.3014335036277771, "learning_rate": 3.3050000000000004e-05, "loss": 0.4638, "step": 1695 }, { "epoch": 154.1939393939394, "grad_norm": 0.24050500988960266, "learning_rate": 3.304e-05, "loss": 0.386, "step": 1696 }, { "epoch": 154.29090909090908, "grad_norm": 0.2561618685722351, "learning_rate": 3.303e-05, "loss": 0.3917, "step": 1697 }, { "epoch": 154.38787878787878, "grad_norm": 0.2533642649650574, "learning_rate": 3.302e-05, "loss": 0.3893, "step": 1698 }, { "epoch": 154.4848484848485, "grad_norm": 0.314961314201355, "learning_rate": 3.3010000000000004e-05, "loss": 0.4156, "step": 1699 }, { "epoch": 154.5818181818182, "grad_norm": 0.28019827604293823, "learning_rate": 3.3e-05, "loss": 0.4582, "step": 1700 }, { "epoch": 154.5818181818182, "eval_loss": 0.47177475690841675, "eval_runtime": 2.1504, "eval_samples_per_second": 25.577, "eval_steps_per_second": 3.255, "step": 1700 }, { "epoch": 154.6787878787879, "grad_norm": 0.27833741903305054, "learning_rate": 3.299e-05, "loss": 0.4155, "step": 1701 }, { "epoch": 154.77575757575758, "grad_norm": 0.2910160720348358, "learning_rate": 3.298e-05, "loss": 0.4179, "step": 1702 }, { "epoch": 154.87272727272727, "grad_norm": 0.26338979601860046, "learning_rate": 3.297e-05, "loss": 0.4019, "step": 1703 }, { "epoch": 154.96969696969697, "grad_norm": 0.23985280096530914, "learning_rate": 3.296e-05, "loss": 0.3711, "step": 1704 }, { "epoch": 155.0, "grad_norm": 0.4723038971424103, "learning_rate": 3.295e-05, "loss": 0.4152, "step": 1705 }, { "epoch": 155.0969696969697, "grad_norm": 0.27281108498573303, "learning_rate": 3.2940000000000006e-05, "loss": 0.4006, "step": 1706 }, { "epoch": 155.1939393939394, "grad_norm": 0.22958678007125854, "learning_rate": 3.293e-05, "loss": 0.3824, "step": 1707 }, { "epoch": 155.29090909090908, "grad_norm": 0.23069177567958832, "learning_rate": 3.292e-05, "loss": 0.4577, "step": 1708 }, { "epoch": 155.38787878787878, "grad_norm": 0.3019179403781891, "learning_rate": 3.291e-05, "loss": 0.4202, "step": 1709 }, { "epoch": 155.4848484848485, "grad_norm": 0.3441668450832367, "learning_rate": 3.29e-05, "loss": 0.4257, "step": 1710 }, { "epoch": 155.4848484848485, "eval_loss": 0.4712180495262146, "eval_runtime": 2.1716, "eval_samples_per_second": 25.327, "eval_steps_per_second": 3.223, "step": 1710 }, { "epoch": 155.5818181818182, "grad_norm": 0.2304125428199768, "learning_rate": 3.2890000000000005e-05, "loss": 0.4019, "step": 1711 }, { "epoch": 155.6787878787879, "grad_norm": 0.3568699359893799, "learning_rate": 3.288e-05, "loss": 0.4266, "step": 1712 }, { "epoch": 155.77575757575758, "grad_norm": 0.35448890924453735, "learning_rate": 3.287e-05, "loss": 0.3887, "step": 1713 }, { "epoch": 155.87272727272727, "grad_norm": 0.2856895327568054, "learning_rate": 3.286e-05, "loss": 0.3936, "step": 1714 }, { "epoch": 155.96969696969697, "grad_norm": 0.26315397024154663, "learning_rate": 3.2850000000000006e-05, "loss": 0.4024, "step": 1715 }, { "epoch": 156.0, "grad_norm": 0.49158674478530884, "learning_rate": 3.2840000000000004e-05, "loss": 0.4161, "step": 1716 }, { "epoch": 156.0969696969697, "grad_norm": 0.2861306071281433, "learning_rate": 3.283e-05, "loss": 0.4164, "step": 1717 }, { "epoch": 156.1939393939394, "grad_norm": 0.2937961220741272, "learning_rate": 3.282e-05, "loss": 0.4228, "step": 1718 }, { "epoch": 156.29090909090908, "grad_norm": 0.26782360672950745, "learning_rate": 3.281e-05, "loss": 0.4491, "step": 1719 }, { "epoch": 156.38787878787878, "grad_norm": 0.30058830976486206, "learning_rate": 3.2800000000000004e-05, "loss": 0.3851, "step": 1720 }, { "epoch": 156.38787878787878, "eval_loss": 0.47035104036331177, "eval_runtime": 2.1673, "eval_samples_per_second": 25.377, "eval_steps_per_second": 3.23, "step": 1720 }, { "epoch": 156.4848484848485, "grad_norm": 0.22291842103004456, "learning_rate": 3.279e-05, "loss": 0.3677, "step": 1721 }, { "epoch": 156.5818181818182, "grad_norm": 0.26373815536499023, "learning_rate": 3.278e-05, "loss": 0.4028, "step": 1722 }, { "epoch": 156.6787878787879, "grad_norm": 0.2856782078742981, "learning_rate": 3.277e-05, "loss": 0.4263, "step": 1723 }, { "epoch": 156.77575757575758, "grad_norm": 0.24891617894172668, "learning_rate": 3.2760000000000005e-05, "loss": 0.428, "step": 1724 }, { "epoch": 156.87272727272727, "grad_norm": 0.3333253860473633, "learning_rate": 3.275e-05, "loss": 0.4135, "step": 1725 }, { "epoch": 156.96969696969697, "grad_norm": 0.2727908194065094, "learning_rate": 3.274e-05, "loss": 0.3742, "step": 1726 }, { "epoch": 157.0, "grad_norm": 0.5115933418273926, "learning_rate": 3.273e-05, "loss": 0.4297, "step": 1727 }, { "epoch": 157.0969696969697, "grad_norm": 0.2451162487268448, "learning_rate": 3.272e-05, "loss": 0.4076, "step": 1728 }, { "epoch": 157.1939393939394, "grad_norm": 0.32220616936683655, "learning_rate": 3.2710000000000004e-05, "loss": 0.4132, "step": 1729 }, { "epoch": 157.29090909090908, "grad_norm": 0.26839929819107056, "learning_rate": 3.27e-05, "loss": 0.4217, "step": 1730 }, { "epoch": 157.29090909090908, "eval_loss": 0.4702255129814148, "eval_runtime": 2.1582, "eval_samples_per_second": 25.484, "eval_steps_per_second": 3.243, "step": 1730 }, { "epoch": 157.38787878787878, "grad_norm": 0.26511865854263306, "learning_rate": 3.269000000000001e-05, "loss": 0.3892, "step": 1731 }, { "epoch": 157.4848484848485, "grad_norm": 0.2777853310108185, "learning_rate": 3.268e-05, "loss": 0.4259, "step": 1732 }, { "epoch": 157.5818181818182, "grad_norm": 0.32341113686561584, "learning_rate": 3.267e-05, "loss": 0.3851, "step": 1733 }, { "epoch": 157.6787878787879, "grad_norm": 0.29954448342323303, "learning_rate": 3.266e-05, "loss": 0.4389, "step": 1734 }, { "epoch": 157.77575757575758, "grad_norm": 0.24362747371196747, "learning_rate": 3.265e-05, "loss": 0.404, "step": 1735 }, { "epoch": 157.87272727272727, "grad_norm": 0.27364447712898254, "learning_rate": 3.2640000000000006e-05, "loss": 0.445, "step": 1736 }, { "epoch": 157.96969696969697, "grad_norm": 0.26995348930358887, "learning_rate": 3.263e-05, "loss": 0.3549, "step": 1737 }, { "epoch": 158.0, "grad_norm": 0.5283206105232239, "learning_rate": 3.262e-05, "loss": 0.4122, "step": 1738 }, { "epoch": 158.0969696969697, "grad_norm": 0.30771592259407043, "learning_rate": 3.261e-05, "loss": 0.4484, "step": 1739 }, { "epoch": 158.1939393939394, "grad_norm": 0.2356712818145752, "learning_rate": 3.26e-05, "loss": 0.4009, "step": 1740 }, { "epoch": 158.1939393939394, "eval_loss": 0.46936678886413574, "eval_runtime": 2.1604, "eval_samples_per_second": 25.458, "eval_steps_per_second": 3.24, "step": 1740 }, { "epoch": 158.29090909090908, "grad_norm": 0.2755451500415802, "learning_rate": 3.2590000000000005e-05, "loss": 0.3879, "step": 1741 }, { "epoch": 158.38787878787878, "grad_norm": 0.3021290898323059, "learning_rate": 3.2579999999999996e-05, "loss": 0.3837, "step": 1742 }, { "epoch": 158.4848484848485, "grad_norm": 0.33612263202667236, "learning_rate": 3.257e-05, "loss": 0.4023, "step": 1743 }, { "epoch": 158.5818181818182, "grad_norm": 0.29134008288383484, "learning_rate": 3.256e-05, "loss": 0.3893, "step": 1744 }, { "epoch": 158.6787878787879, "grad_norm": 0.28086304664611816, "learning_rate": 3.2550000000000005e-05, "loss": 0.4277, "step": 1745 }, { "epoch": 158.77575757575758, "grad_norm": 0.298905611038208, "learning_rate": 3.2540000000000004e-05, "loss": 0.4219, "step": 1746 }, { "epoch": 158.87272727272727, "grad_norm": 0.23620347678661346, "learning_rate": 3.253e-05, "loss": 0.3694, "step": 1747 }, { "epoch": 158.96969696969697, "grad_norm": 0.34921467304229736, "learning_rate": 3.252e-05, "loss": 0.4472, "step": 1748 }, { "epoch": 159.0, "grad_norm": 0.4663753807544708, "learning_rate": 3.251e-05, "loss": 0.411, "step": 1749 }, { "epoch": 159.0969696969697, "grad_norm": 0.35847172141075134, "learning_rate": 3.2500000000000004e-05, "loss": 0.4179, "step": 1750 }, { "epoch": 159.0969696969697, "eval_loss": 0.4685215353965759, "eval_runtime": 2.1614, "eval_samples_per_second": 25.446, "eval_steps_per_second": 3.239, "step": 1750 }, { "epoch": 159.1939393939394, "grad_norm": 0.282756507396698, "learning_rate": 3.249e-05, "loss": 0.409, "step": 1751 }, { "epoch": 159.29090909090908, "grad_norm": 0.2560710310935974, "learning_rate": 3.248e-05, "loss": 0.4262, "step": 1752 }, { "epoch": 159.38787878787878, "grad_norm": 0.26294830441474915, "learning_rate": 3.247e-05, "loss": 0.343, "step": 1753 }, { "epoch": 159.4848484848485, "grad_norm": 0.3224695324897766, "learning_rate": 3.2460000000000004e-05, "loss": 0.4206, "step": 1754 }, { "epoch": 159.5818181818182, "grad_norm": 0.28522032499313354, "learning_rate": 3.245e-05, "loss": 0.3707, "step": 1755 }, { "epoch": 159.6787878787879, "grad_norm": 0.34134167432785034, "learning_rate": 3.244e-05, "loss": 0.4514, "step": 1756 }, { "epoch": 159.77575757575758, "grad_norm": 0.24785830080509186, "learning_rate": 3.243e-05, "loss": 0.3801, "step": 1757 }, { "epoch": 159.87272727272727, "grad_norm": 0.35771289467811584, "learning_rate": 3.242e-05, "loss": 0.3888, "step": 1758 }, { "epoch": 159.96969696969697, "grad_norm": 0.40290340781211853, "learning_rate": 3.241e-05, "loss": 0.4668, "step": 1759 }, { "epoch": 160.0, "grad_norm": 0.5135341286659241, "learning_rate": 3.24e-05, "loss": 0.3879, "step": 1760 }, { "epoch": 160.0, "eval_loss": 0.4681518077850342, "eval_runtime": 2.1812, "eval_samples_per_second": 25.215, "eval_steps_per_second": 3.209, "step": 1760 }, { "epoch": 160.0969696969697, "grad_norm": 0.2755342423915863, "learning_rate": 3.239000000000001e-05, "loss": 0.3986, "step": 1761 }, { "epoch": 160.1939393939394, "grad_norm": 0.34573477506637573, "learning_rate": 3.238e-05, "loss": 0.4542, "step": 1762 }, { "epoch": 160.29090909090908, "grad_norm": 0.31546109914779663, "learning_rate": 3.2370000000000003e-05, "loss": 0.4341, "step": 1763 }, { "epoch": 160.38787878787878, "grad_norm": 0.3111988306045532, "learning_rate": 3.236e-05, "loss": 0.4386, "step": 1764 }, { "epoch": 160.4848484848485, "grad_norm": 0.33897435665130615, "learning_rate": 3.235e-05, "loss": 0.3434, "step": 1765 }, { "epoch": 160.5818181818182, "grad_norm": 0.27282023429870605, "learning_rate": 3.2340000000000005e-05, "loss": 0.4253, "step": 1766 }, { "epoch": 160.6787878787879, "grad_norm": 0.25652244687080383, "learning_rate": 3.233e-05, "loss": 0.4284, "step": 1767 }, { "epoch": 160.77575757575758, "grad_norm": 0.3286043107509613, "learning_rate": 3.232e-05, "loss": 0.3984, "step": 1768 }, { "epoch": 160.87272727272727, "grad_norm": 0.34483659267425537, "learning_rate": 3.231e-05, "loss": 0.3858, "step": 1769 }, { "epoch": 160.96969696969697, "grad_norm": 0.28678274154663086, "learning_rate": 3.2300000000000006e-05, "loss": 0.347, "step": 1770 }, { "epoch": 160.96969696969697, "eval_loss": 0.46771591901779175, "eval_runtime": 2.1478, "eval_samples_per_second": 25.607, "eval_steps_per_second": 3.259, "step": 1770 }, { "epoch": 161.0, "grad_norm": 0.4694156348705292, "learning_rate": 3.2290000000000004e-05, "loss": 0.4208, "step": 1771 }, { "epoch": 161.0969696969697, "grad_norm": 0.2683364152908325, "learning_rate": 3.2279999999999996e-05, "loss": 0.4647, "step": 1772 }, { "epoch": 161.1939393939394, "grad_norm": 0.27918240427970886, "learning_rate": 3.227e-05, "loss": 0.4131, "step": 1773 }, { "epoch": 161.29090909090908, "grad_norm": 0.25537046790122986, "learning_rate": 3.226e-05, "loss": 0.3655, "step": 1774 }, { "epoch": 161.38787878787878, "grad_norm": 0.2649289071559906, "learning_rate": 3.2250000000000005e-05, "loss": 0.4214, "step": 1775 }, { "epoch": 161.4848484848485, "grad_norm": 0.3552446663379669, "learning_rate": 3.224e-05, "loss": 0.416, "step": 1776 }, { "epoch": 161.5818181818182, "grad_norm": 0.2456580251455307, "learning_rate": 3.223e-05, "loss": 0.3932, "step": 1777 }, { "epoch": 161.6787878787879, "grad_norm": 0.32637685537338257, "learning_rate": 3.222e-05, "loss": 0.4071, "step": 1778 }, { "epoch": 161.77575757575758, "grad_norm": 0.3025254011154175, "learning_rate": 3.221e-05, "loss": 0.3814, "step": 1779 }, { "epoch": 161.87272727272727, "grad_norm": 0.2737649977207184, "learning_rate": 3.2200000000000003e-05, "loss": 0.3886, "step": 1780 }, { "epoch": 161.87272727272727, "eval_loss": 0.4672313332557678, "eval_runtime": 2.1526, "eval_samples_per_second": 25.55, "eval_steps_per_second": 3.252, "step": 1780 }, { "epoch": 161.96969696969697, "grad_norm": 0.25991830229759216, "learning_rate": 3.219e-05, "loss": 0.4104, "step": 1781 }, { "epoch": 162.0, "grad_norm": 0.4407697021961212, "learning_rate": 3.218e-05, "loss": 0.3704, "step": 1782 }, { "epoch": 162.0969696969697, "grad_norm": 0.25895529985427856, "learning_rate": 3.217e-05, "loss": 0.4313, "step": 1783 }, { "epoch": 162.1939393939394, "grad_norm": 0.26714879274368286, "learning_rate": 3.2160000000000004e-05, "loss": 0.4393, "step": 1784 }, { "epoch": 162.29090909090908, "grad_norm": 0.41708916425704956, "learning_rate": 3.215e-05, "loss": 0.39, "step": 1785 }, { "epoch": 162.38787878787878, "grad_norm": 0.28017428517341614, "learning_rate": 3.214e-05, "loss": 0.4031, "step": 1786 }, { "epoch": 162.4848484848485, "grad_norm": 0.3049089312553406, "learning_rate": 3.213e-05, "loss": 0.3572, "step": 1787 }, { "epoch": 162.5818181818182, "grad_norm": 0.2849743664264679, "learning_rate": 3.212e-05, "loss": 0.3748, "step": 1788 }, { "epoch": 162.6787878787879, "grad_norm": 0.2804197371006012, "learning_rate": 3.211e-05, "loss": 0.4213, "step": 1789 }, { "epoch": 162.77575757575758, "grad_norm": 0.24261786043643951, "learning_rate": 3.21e-05, "loss": 0.4041, "step": 1790 }, { "epoch": 162.77575757575758, "eval_loss": 0.46590346097946167, "eval_runtime": 2.1701, "eval_samples_per_second": 25.345, "eval_steps_per_second": 3.226, "step": 1790 }, { "epoch": 162.87272727272727, "grad_norm": 0.3327369689941406, "learning_rate": 3.2090000000000006e-05, "loss": 0.4168, "step": 1791 }, { "epoch": 162.96969696969697, "grad_norm": 0.3327149748802185, "learning_rate": 3.208e-05, "loss": 0.3969, "step": 1792 }, { "epoch": 163.0, "grad_norm": 0.7095963358879089, "learning_rate": 3.207e-05, "loss": 0.4276, "step": 1793 }, { "epoch": 163.0969696969697, "grad_norm": 0.2949429452419281, "learning_rate": 3.206e-05, "loss": 0.4401, "step": 1794 }, { "epoch": 163.1939393939394, "grad_norm": 0.2918664813041687, "learning_rate": 3.205e-05, "loss": 0.4117, "step": 1795 }, { "epoch": 163.29090909090908, "grad_norm": 0.3940178453922272, "learning_rate": 3.2040000000000005e-05, "loss": 0.4197, "step": 1796 }, { "epoch": 163.38787878787878, "grad_norm": 0.27344033122062683, "learning_rate": 3.2029999999999997e-05, "loss": 0.3974, "step": 1797 }, { "epoch": 163.4848484848485, "grad_norm": 0.33115363121032715, "learning_rate": 3.202e-05, "loss": 0.4082, "step": 1798 }, { "epoch": 163.5818181818182, "grad_norm": 0.29205378890037537, "learning_rate": 3.201e-05, "loss": 0.4047, "step": 1799 }, { "epoch": 163.6787878787879, "grad_norm": 0.29947832226753235, "learning_rate": 3.2000000000000005e-05, "loss": 0.3817, "step": 1800 }, { "epoch": 163.6787878787879, "eval_loss": 0.4654845893383026, "eval_runtime": 2.1572, "eval_samples_per_second": 25.496, "eval_steps_per_second": 3.245, "step": 1800 }, { "epoch": 163.77575757575758, "grad_norm": 0.32452553510665894, "learning_rate": 3.1990000000000004e-05, "loss": 0.3824, "step": 1801 }, { "epoch": 163.87272727272727, "grad_norm": 0.24157431721687317, "learning_rate": 3.198e-05, "loss": 0.4015, "step": 1802 }, { "epoch": 163.96969696969697, "grad_norm": 0.30334722995758057, "learning_rate": 3.197e-05, "loss": 0.4231, "step": 1803 }, { "epoch": 164.0, "grad_norm": 0.47599658370018005, "learning_rate": 3.196e-05, "loss": 0.296, "step": 1804 }, { "epoch": 164.0969696969697, "grad_norm": 0.23312796652317047, "learning_rate": 3.1950000000000004e-05, "loss": 0.4002, "step": 1805 }, { "epoch": 164.1939393939394, "grad_norm": 0.302068293094635, "learning_rate": 3.194e-05, "loss": 0.3674, "step": 1806 }, { "epoch": 164.29090909090908, "grad_norm": 0.3966183662414551, "learning_rate": 3.193e-05, "loss": 0.4546, "step": 1807 }, { "epoch": 164.38787878787878, "grad_norm": 0.25288668274879456, "learning_rate": 3.192e-05, "loss": 0.4078, "step": 1808 }, { "epoch": 164.4848484848485, "grad_norm": 0.28041747212409973, "learning_rate": 3.191e-05, "loss": 0.3804, "step": 1809 }, { "epoch": 164.5818181818182, "grad_norm": 0.2514358460903168, "learning_rate": 3.19e-05, "loss": 0.3812, "step": 1810 }, { "epoch": 164.5818181818182, "eval_loss": 0.4653727412223816, "eval_runtime": 2.1628, "eval_samples_per_second": 25.43, "eval_steps_per_second": 3.237, "step": 1810 }, { "epoch": 164.6787878787879, "grad_norm": 0.3115096092224121, "learning_rate": 3.189e-05, "loss": 0.4226, "step": 1811 }, { "epoch": 164.77575757575758, "grad_norm": 0.28320327401161194, "learning_rate": 3.188e-05, "loss": 0.3643, "step": 1812 }, { "epoch": 164.87272727272727, "grad_norm": 0.3034592866897583, "learning_rate": 3.187e-05, "loss": 0.4262, "step": 1813 }, { "epoch": 164.96969696969697, "grad_norm": 0.33773717284202576, "learning_rate": 3.186e-05, "loss": 0.4256, "step": 1814 }, { "epoch": 165.0, "grad_norm": 0.5290171504020691, "learning_rate": 3.185e-05, "loss": 0.3871, "step": 1815 }, { "epoch": 165.0969696969697, "grad_norm": 0.25624457001686096, "learning_rate": 3.184e-05, "loss": 0.4394, "step": 1816 }, { "epoch": 165.1939393939394, "grad_norm": 0.25744593143463135, "learning_rate": 3.1830000000000005e-05, "loss": 0.4091, "step": 1817 }, { "epoch": 165.29090909090908, "grad_norm": 0.339099258184433, "learning_rate": 3.182e-05, "loss": 0.3906, "step": 1818 }, { "epoch": 165.38787878787878, "grad_norm": 0.30119866132736206, "learning_rate": 3.181e-05, "loss": 0.4274, "step": 1819 }, { "epoch": 165.4848484848485, "grad_norm": 0.32787880301475525, "learning_rate": 3.18e-05, "loss": 0.441, "step": 1820 }, { "epoch": 165.4848484848485, "eval_loss": 0.4649874269962311, "eval_runtime": 2.1606, "eval_samples_per_second": 25.456, "eval_steps_per_second": 3.24, "step": 1820 }, { "epoch": 165.5818181818182, "grad_norm": 0.316469669342041, "learning_rate": 3.1790000000000006e-05, "loss": 0.3948, "step": 1821 }, { "epoch": 165.6787878787879, "grad_norm": 0.28067582845687866, "learning_rate": 3.1780000000000004e-05, "loss": 0.3742, "step": 1822 }, { "epoch": 165.77575757575758, "grad_norm": 0.2999702990055084, "learning_rate": 3.177e-05, "loss": 0.3831, "step": 1823 }, { "epoch": 165.87272727272727, "grad_norm": 0.22824245691299438, "learning_rate": 3.176e-05, "loss": 0.3727, "step": 1824 }, { "epoch": 165.96969696969697, "grad_norm": 0.3537152409553528, "learning_rate": 3.175e-05, "loss": 0.4028, "step": 1825 }, { "epoch": 166.0, "grad_norm": 0.49454474449157715, "learning_rate": 3.1740000000000004e-05, "loss": 0.3595, "step": 1826 }, { "epoch": 166.0969696969697, "grad_norm": 0.3131580054759979, "learning_rate": 3.173e-05, "loss": 0.4085, "step": 1827 }, { "epoch": 166.1939393939394, "grad_norm": 0.3912461996078491, "learning_rate": 3.172e-05, "loss": 0.4287, "step": 1828 }, { "epoch": 166.29090909090908, "grad_norm": 0.30655741691589355, "learning_rate": 3.171e-05, "loss": 0.4043, "step": 1829 }, { "epoch": 166.38787878787878, "grad_norm": 0.3822678029537201, "learning_rate": 3.1700000000000005e-05, "loss": 0.3814, "step": 1830 }, { "epoch": 166.38787878787878, "eval_loss": 0.4645639955997467, "eval_runtime": 2.1598, "eval_samples_per_second": 25.465, "eval_steps_per_second": 3.241, "step": 1830 }, { "epoch": 166.4848484848485, "grad_norm": 0.346089243888855, "learning_rate": 3.169e-05, "loss": 0.3884, "step": 1831 }, { "epoch": 166.5818181818182, "grad_norm": 0.2965271472930908, "learning_rate": 3.168e-05, "loss": 0.4098, "step": 1832 }, { "epoch": 166.6787878787879, "grad_norm": 0.29764848947525024, "learning_rate": 3.167e-05, "loss": 0.4053, "step": 1833 }, { "epoch": 166.77575757575758, "grad_norm": 0.24490293860435486, "learning_rate": 3.166e-05, "loss": 0.3439, "step": 1834 }, { "epoch": 166.87272727272727, "grad_norm": 0.28927063941955566, "learning_rate": 3.1650000000000004e-05, "loss": 0.4155, "step": 1835 }, { "epoch": 166.96969696969697, "grad_norm": 0.27231332659721375, "learning_rate": 3.164e-05, "loss": 0.4291, "step": 1836 }, { "epoch": 167.0, "grad_norm": 0.5904393792152405, "learning_rate": 3.163000000000001e-05, "loss": 0.4002, "step": 1837 }, { "epoch": 167.0969696969697, "grad_norm": 0.30065590143203735, "learning_rate": 3.162e-05, "loss": 0.4008, "step": 1838 }, { "epoch": 167.1939393939394, "grad_norm": 0.2866343855857849, "learning_rate": 3.1610000000000004e-05, "loss": 0.3725, "step": 1839 }, { "epoch": 167.29090909090908, "grad_norm": 0.2981482744216919, "learning_rate": 3.16e-05, "loss": 0.4071, "step": 1840 }, { "epoch": 167.29090909090908, "eval_loss": 0.46402543783187866, "eval_runtime": 2.1685, "eval_samples_per_second": 25.363, "eval_steps_per_second": 3.228, "step": 1840 }, { "epoch": 167.38787878787878, "grad_norm": 0.2863614857196808, "learning_rate": 3.159e-05, "loss": 0.3969, "step": 1841 }, { "epoch": 167.4848484848485, "grad_norm": 0.276218056678772, "learning_rate": 3.1580000000000006e-05, "loss": 0.4347, "step": 1842 }, { "epoch": 167.5818181818182, "grad_norm": 0.29230207204818726, "learning_rate": 3.157e-05, "loss": 0.4602, "step": 1843 }, { "epoch": 167.6787878787879, "grad_norm": 0.36772671341896057, "learning_rate": 3.156e-05, "loss": 0.399, "step": 1844 }, { "epoch": 167.77575757575758, "grad_norm": 0.38044774532318115, "learning_rate": 3.155e-05, "loss": 0.3632, "step": 1845 }, { "epoch": 167.87272727272727, "grad_norm": 0.2674912214279175, "learning_rate": 3.154e-05, "loss": 0.397, "step": 1846 }, { "epoch": 167.96969696969697, "grad_norm": 0.3134581446647644, "learning_rate": 3.1530000000000005e-05, "loss": 0.3744, "step": 1847 }, { "epoch": 168.0, "grad_norm": 0.4266735017299652, "learning_rate": 3.1519999999999996e-05, "loss": 0.3918, "step": 1848 }, { "epoch": 168.0969696969697, "grad_norm": 0.3174222409725189, "learning_rate": 3.151e-05, "loss": 0.3921, "step": 1849 }, { "epoch": 168.1939393939394, "grad_norm": 0.2756790518760681, "learning_rate": 3.15e-05, "loss": 0.3856, "step": 1850 }, { "epoch": 168.1939393939394, "eval_loss": 0.4628044366836548, "eval_runtime": 2.209, "eval_samples_per_second": 24.898, "eval_steps_per_second": 3.169, "step": 1850 }, { "epoch": 168.29090909090908, "grad_norm": 0.290212482213974, "learning_rate": 3.1490000000000005e-05, "loss": 0.4029, "step": 1851 }, { "epoch": 168.38787878787878, "grad_norm": 0.3511327803134918, "learning_rate": 3.1480000000000004e-05, "loss": 0.3939, "step": 1852 }, { "epoch": 168.4848484848485, "grad_norm": 0.2698800563812256, "learning_rate": 3.147e-05, "loss": 0.4087, "step": 1853 }, { "epoch": 168.5818181818182, "grad_norm": 0.30846282839775085, "learning_rate": 3.146e-05, "loss": 0.3729, "step": 1854 }, { "epoch": 168.6787878787879, "grad_norm": 0.32614874839782715, "learning_rate": 3.145e-05, "loss": 0.4484, "step": 1855 }, { "epoch": 168.77575757575758, "grad_norm": 0.27173444628715515, "learning_rate": 3.1440000000000004e-05, "loss": 0.3884, "step": 1856 }, { "epoch": 168.87272727272727, "grad_norm": 0.2709703743457794, "learning_rate": 3.143e-05, "loss": 0.3991, "step": 1857 }, { "epoch": 168.96969696969697, "grad_norm": 0.24700695276260376, "learning_rate": 3.142e-05, "loss": 0.3884, "step": 1858 }, { "epoch": 169.0, "grad_norm": 0.48396357893943787, "learning_rate": 3.141e-05, "loss": 0.4496, "step": 1859 }, { "epoch": 169.0969696969697, "grad_norm": 0.285308837890625, "learning_rate": 3.1400000000000004e-05, "loss": 0.4341, "step": 1860 }, { "epoch": 169.0969696969697, "eval_loss": 0.4620818495750427, "eval_runtime": 2.157, "eval_samples_per_second": 25.499, "eval_steps_per_second": 3.245, "step": 1860 }, { "epoch": 169.1939393939394, "grad_norm": 0.26520836353302, "learning_rate": 3.139e-05, "loss": 0.3765, "step": 1861 }, { "epoch": 169.29090909090908, "grad_norm": 0.2857014536857605, "learning_rate": 3.138e-05, "loss": 0.402, "step": 1862 }, { "epoch": 169.38787878787878, "grad_norm": 0.3332759737968445, "learning_rate": 3.137e-05, "loss": 0.3689, "step": 1863 }, { "epoch": 169.4848484848485, "grad_norm": 0.29484209418296814, "learning_rate": 3.136e-05, "loss": 0.4029, "step": 1864 }, { "epoch": 169.5818181818182, "grad_norm": 0.3884888291358948, "learning_rate": 3.135e-05, "loss": 0.4124, "step": 1865 }, { "epoch": 169.6787878787879, "grad_norm": 0.31348225474357605, "learning_rate": 3.134e-05, "loss": 0.3802, "step": 1866 }, { "epoch": 169.77575757575758, "grad_norm": 0.2756951153278351, "learning_rate": 3.133000000000001e-05, "loss": 0.432, "step": 1867 }, { "epoch": 169.87272727272727, "grad_norm": 0.2762760519981384, "learning_rate": 3.132e-05, "loss": 0.3936, "step": 1868 }, { "epoch": 169.96969696969697, "grad_norm": 0.27015718817710876, "learning_rate": 3.1310000000000003e-05, "loss": 0.4193, "step": 1869 }, { "epoch": 170.0, "grad_norm": 0.46624138951301575, "learning_rate": 3.13e-05, "loss": 0.2868, "step": 1870 }, { "epoch": 170.0, "eval_loss": 0.4615083634853363, "eval_runtime": 2.1721, "eval_samples_per_second": 25.321, "eval_steps_per_second": 3.223, "step": 1870 }, { "epoch": 170.0969696969697, "grad_norm": 0.3341410756111145, "learning_rate": 3.129e-05, "loss": 0.403, "step": 1871 }, { "epoch": 170.1939393939394, "grad_norm": 0.30921101570129395, "learning_rate": 3.1280000000000005e-05, "loss": 0.3938, "step": 1872 }, { "epoch": 170.29090909090908, "grad_norm": 0.3262888789176941, "learning_rate": 3.127e-05, "loss": 0.4082, "step": 1873 }, { "epoch": 170.38787878787878, "grad_norm": 0.29888027906417847, "learning_rate": 3.126e-05, "loss": 0.3694, "step": 1874 }, { "epoch": 170.4848484848485, "grad_norm": 0.30654269456863403, "learning_rate": 3.125e-05, "loss": 0.3891, "step": 1875 }, { "epoch": 170.5818181818182, "grad_norm": 0.2998983561992645, "learning_rate": 3.1240000000000006e-05, "loss": 0.3869, "step": 1876 }, { "epoch": 170.6787878787879, "grad_norm": 0.29455775022506714, "learning_rate": 3.1230000000000004e-05, "loss": 0.3903, "step": 1877 }, { "epoch": 170.77575757575758, "grad_norm": 0.3592742681503296, "learning_rate": 3.122e-05, "loss": 0.4284, "step": 1878 }, { "epoch": 170.87272727272727, "grad_norm": 0.3325028121471405, "learning_rate": 3.121e-05, "loss": 0.4097, "step": 1879 }, { "epoch": 170.96969696969697, "grad_norm": 0.29504528641700745, "learning_rate": 3.12e-05, "loss": 0.3819, "step": 1880 }, { "epoch": 170.96969696969697, "eval_loss": 0.4609839618206024, "eval_runtime": 2.1984, "eval_samples_per_second": 25.018, "eval_steps_per_second": 3.184, "step": 1880 }, { "epoch": 171.0, "grad_norm": 0.42094412446022034, "learning_rate": 3.1190000000000005e-05, "loss": 0.4709, "step": 1881 }, { "epoch": 171.0969696969697, "grad_norm": 0.26900404691696167, "learning_rate": 3.118e-05, "loss": 0.3885, "step": 1882 }, { "epoch": 171.1939393939394, "grad_norm": 0.36168283224105835, "learning_rate": 3.117e-05, "loss": 0.3713, "step": 1883 }, { "epoch": 171.29090909090908, "grad_norm": 0.28963232040405273, "learning_rate": 3.116e-05, "loss": 0.3786, "step": 1884 }, { "epoch": 171.38787878787878, "grad_norm": 0.30029475688934326, "learning_rate": 3.115e-05, "loss": 0.3851, "step": 1885 }, { "epoch": 171.4848484848485, "grad_norm": 0.2340232878923416, "learning_rate": 3.1140000000000003e-05, "loss": 0.3724, "step": 1886 }, { "epoch": 171.5818181818182, "grad_norm": 0.3094693720340729, "learning_rate": 3.113e-05, "loss": 0.4172, "step": 1887 }, { "epoch": 171.6787878787879, "grad_norm": 0.32682815194129944, "learning_rate": 3.112e-05, "loss": 0.4379, "step": 1888 }, { "epoch": 171.77575757575758, "grad_norm": 0.39302778244018555, "learning_rate": 3.111e-05, "loss": 0.4312, "step": 1889 }, { "epoch": 171.87272727272727, "grad_norm": 0.3695316016674042, "learning_rate": 3.1100000000000004e-05, "loss": 0.4128, "step": 1890 }, { "epoch": 171.87272727272727, "eval_loss": 0.46061545610427856, "eval_runtime": 2.1757, "eval_samples_per_second": 25.279, "eval_steps_per_second": 3.217, "step": 1890 }, { "epoch": 171.96969696969697, "grad_norm": 0.3330514430999756, "learning_rate": 3.109e-05, "loss": 0.3874, "step": 1891 }, { "epoch": 172.0, "grad_norm": 0.37093260884284973, "learning_rate": 3.108e-05, "loss": 0.3705, "step": 1892 }, { "epoch": 172.0969696969697, "grad_norm": 0.23208312690258026, "learning_rate": 3.107e-05, "loss": 0.3781, "step": 1893 }, { "epoch": 172.1939393939394, "grad_norm": 0.2550165355205536, "learning_rate": 3.106e-05, "loss": 0.3863, "step": 1894 }, { "epoch": 172.29090909090908, "grad_norm": 0.35588306188583374, "learning_rate": 3.105e-05, "loss": 0.416, "step": 1895 }, { "epoch": 172.38787878787878, "grad_norm": 0.26319488883018494, "learning_rate": 3.104e-05, "loss": 0.3926, "step": 1896 }, { "epoch": 172.4848484848485, "grad_norm": 0.23338590562343597, "learning_rate": 3.1030000000000006e-05, "loss": 0.3879, "step": 1897 }, { "epoch": 172.5818181818182, "grad_norm": 0.3668774366378784, "learning_rate": 3.102e-05, "loss": 0.3602, "step": 1898 }, { "epoch": 172.6787878787879, "grad_norm": 0.29479148983955383, "learning_rate": 3.101e-05, "loss": 0.3748, "step": 1899 }, { "epoch": 172.77575757575758, "grad_norm": 0.291279673576355, "learning_rate": 3.1e-05, "loss": 0.4041, "step": 1900 }, { "epoch": 172.77575757575758, "eval_loss": 0.46023276448249817, "eval_runtime": 2.1674, "eval_samples_per_second": 25.375, "eval_steps_per_second": 3.23, "step": 1900 }, { "epoch": 172.87272727272727, "grad_norm": 0.2789687514305115, "learning_rate": 3.099e-05, "loss": 0.4038, "step": 1901 }, { "epoch": 172.96969696969697, "grad_norm": 0.283159077167511, "learning_rate": 3.0980000000000005e-05, "loss": 0.4683, "step": 1902 }, { "epoch": 173.0, "grad_norm": 0.5169331431388855, "learning_rate": 3.0969999999999997e-05, "loss": 0.3729, "step": 1903 }, { "epoch": 173.0969696969697, "grad_norm": 0.30403706431388855, "learning_rate": 3.096e-05, "loss": 0.4029, "step": 1904 }, { "epoch": 173.1939393939394, "grad_norm": 0.2558678686618805, "learning_rate": 3.095e-05, "loss": 0.4093, "step": 1905 }, { "epoch": 173.29090909090908, "grad_norm": 0.34730300307273865, "learning_rate": 3.0940000000000005e-05, "loss": 0.3644, "step": 1906 }, { "epoch": 173.38787878787878, "grad_norm": 0.267134428024292, "learning_rate": 3.0930000000000004e-05, "loss": 0.4018, "step": 1907 }, { "epoch": 173.4848484848485, "grad_norm": 0.22669728100299835, "learning_rate": 3.092e-05, "loss": 0.3635, "step": 1908 }, { "epoch": 173.5818181818182, "grad_norm": 0.3618592619895935, "learning_rate": 3.091e-05, "loss": 0.3999, "step": 1909 }, { "epoch": 173.6787878787879, "grad_norm": 0.3263871669769287, "learning_rate": 3.09e-05, "loss": 0.4313, "step": 1910 }, { "epoch": 173.6787878787879, "eval_loss": 0.4603014588356018, "eval_runtime": 2.1594, "eval_samples_per_second": 25.471, "eval_steps_per_second": 3.242, "step": 1910 }, { "epoch": 173.77575757575758, "grad_norm": 0.3253178894519806, "learning_rate": 3.0890000000000004e-05, "loss": 0.4348, "step": 1911 }, { "epoch": 173.87272727272727, "grad_norm": 0.32916221022605896, "learning_rate": 3.088e-05, "loss": 0.3839, "step": 1912 }, { "epoch": 173.96969696969697, "grad_norm": 0.2587529122829437, "learning_rate": 3.087e-05, "loss": 0.3832, "step": 1913 }, { "epoch": 174.0, "grad_norm": 0.45776283740997314, "learning_rate": 3.086e-05, "loss": 0.3451, "step": 1914 }, { "epoch": 174.0969696969697, "grad_norm": 0.2600422203540802, "learning_rate": 3.0850000000000004e-05, "loss": 0.384, "step": 1915 }, { "epoch": 174.1939393939394, "grad_norm": 0.2785820960998535, "learning_rate": 3.084e-05, "loss": 0.3848, "step": 1916 }, { "epoch": 174.29090909090908, "grad_norm": 0.30167168378829956, "learning_rate": 3.083e-05, "loss": 0.4166, "step": 1917 }, { "epoch": 174.38787878787878, "grad_norm": 0.38409626483917236, "learning_rate": 3.082e-05, "loss": 0.3884, "step": 1918 }, { "epoch": 174.4848484848485, "grad_norm": 0.40932610630989075, "learning_rate": 3.081e-05, "loss": 0.3618, "step": 1919 }, { "epoch": 174.5818181818182, "grad_norm": 0.26346346735954285, "learning_rate": 3.08e-05, "loss": 0.3602, "step": 1920 }, { "epoch": 174.5818181818182, "eval_loss": 0.4591878056526184, "eval_runtime": 2.152, "eval_samples_per_second": 25.558, "eval_steps_per_second": 3.253, "step": 1920 }, { "epoch": 174.6787878787879, "grad_norm": 0.260904997587204, "learning_rate": 3.079e-05, "loss": 0.43, "step": 1921 }, { "epoch": 174.77575757575758, "grad_norm": 0.3299517333507538, "learning_rate": 3.078e-05, "loss": 0.4078, "step": 1922 }, { "epoch": 174.87272727272727, "grad_norm": 0.38013601303100586, "learning_rate": 3.077e-05, "loss": 0.422, "step": 1923 }, { "epoch": 174.96969696969697, "grad_norm": 0.27372559905052185, "learning_rate": 3.076e-05, "loss": 0.3849, "step": 1924 }, { "epoch": 175.0, "grad_norm": 0.590491533279419, "learning_rate": 3.075e-05, "loss": 0.4239, "step": 1925 }, { "epoch": 175.0969696969697, "grad_norm": 0.25433579087257385, "learning_rate": 3.074e-05, "loss": 0.404, "step": 1926 }, { "epoch": 175.1939393939394, "grad_norm": 0.2625212073326111, "learning_rate": 3.0730000000000006e-05, "loss": 0.4086, "step": 1927 }, { "epoch": 175.29090909090908, "grad_norm": 0.24784494936466217, "learning_rate": 3.072e-05, "loss": 0.3628, "step": 1928 }, { "epoch": 175.38787878787878, "grad_norm": 0.2836151719093323, "learning_rate": 3.071e-05, "loss": 0.4296, "step": 1929 }, { "epoch": 175.4848484848485, "grad_norm": 0.376642644405365, "learning_rate": 3.07e-05, "loss": 0.4384, "step": 1930 }, { "epoch": 175.4848484848485, "eval_loss": 0.4586239159107208, "eval_runtime": 2.1663, "eval_samples_per_second": 25.389, "eval_steps_per_second": 3.231, "step": 1930 }, { "epoch": 175.5818181818182, "grad_norm": 0.2923555374145508, "learning_rate": 3.069e-05, "loss": 0.3755, "step": 1931 }, { "epoch": 175.6787878787879, "grad_norm": 0.33017534017562866, "learning_rate": 3.0680000000000004e-05, "loss": 0.3765, "step": 1932 }, { "epoch": 175.77575757575758, "grad_norm": 0.2759827673435211, "learning_rate": 3.0669999999999996e-05, "loss": 0.3779, "step": 1933 }, { "epoch": 175.87272727272727, "grad_norm": 0.25442424416542053, "learning_rate": 3.066e-05, "loss": 0.4056, "step": 1934 }, { "epoch": 175.96969696969697, "grad_norm": 0.3058272898197174, "learning_rate": 3.065e-05, "loss": 0.382, "step": 1935 }, { "epoch": 176.0, "grad_norm": 0.5333647727966309, "learning_rate": 3.0640000000000005e-05, "loss": 0.3437, "step": 1936 }, { "epoch": 176.0969696969697, "grad_norm": 0.2898612320423126, "learning_rate": 3.063e-05, "loss": 0.3814, "step": 1937 }, { "epoch": 176.1939393939394, "grad_norm": 0.2499009072780609, "learning_rate": 3.062e-05, "loss": 0.4063, "step": 1938 }, { "epoch": 176.29090909090908, "grad_norm": 0.30573129653930664, "learning_rate": 3.061e-05, "loss": 0.4219, "step": 1939 }, { "epoch": 176.38787878787878, "grad_norm": 0.31919580698013306, "learning_rate": 3.06e-05, "loss": 0.4216, "step": 1940 }, { "epoch": 176.38787878787878, "eval_loss": 0.45822402834892273, "eval_runtime": 2.1441, "eval_samples_per_second": 25.651, "eval_steps_per_second": 3.265, "step": 1940 }, { "epoch": 176.4848484848485, "grad_norm": 0.2452893853187561, "learning_rate": 3.0590000000000004e-05, "loss": 0.3846, "step": 1941 }, { "epoch": 176.5818181818182, "grad_norm": 0.23256346583366394, "learning_rate": 3.058e-05, "loss": 0.3503, "step": 1942 }, { "epoch": 176.6787878787879, "grad_norm": 0.298978716135025, "learning_rate": 3.057000000000001e-05, "loss": 0.3615, "step": 1943 }, { "epoch": 176.77575757575758, "grad_norm": 0.29125311970710754, "learning_rate": 3.056e-05, "loss": 0.391, "step": 1944 }, { "epoch": 176.87272727272727, "grad_norm": 0.3193817436695099, "learning_rate": 3.0550000000000004e-05, "loss": 0.4168, "step": 1945 }, { "epoch": 176.96969696969697, "grad_norm": 0.2996191382408142, "learning_rate": 3.054e-05, "loss": 0.3894, "step": 1946 }, { "epoch": 177.0, "grad_norm": 0.48652327060699463, "learning_rate": 3.053e-05, "loss": 0.4369, "step": 1947 }, { "epoch": 177.0969696969697, "grad_norm": 0.2905808985233307, "learning_rate": 3.0520000000000006e-05, "loss": 0.3961, "step": 1948 }, { "epoch": 177.1939393939394, "grad_norm": 0.3679002821445465, "learning_rate": 3.051e-05, "loss": 0.4492, "step": 1949 }, { "epoch": 177.29090909090908, "grad_norm": 0.2981612980365753, "learning_rate": 3.05e-05, "loss": 0.393, "step": 1950 }, { "epoch": 177.29090909090908, "eval_loss": 0.4579584300518036, "eval_runtime": 2.1224, "eval_samples_per_second": 25.914, "eval_steps_per_second": 3.298, "step": 1950 }, { "epoch": 177.38787878787878, "grad_norm": 0.3025275766849518, "learning_rate": 3.049e-05, "loss": 0.4055, "step": 1951 }, { "epoch": 177.4848484848485, "grad_norm": 0.26585206389427185, "learning_rate": 3.0480000000000003e-05, "loss": 0.3805, "step": 1952 }, { "epoch": 177.5818181818182, "grad_norm": 0.2868068516254425, "learning_rate": 3.0470000000000005e-05, "loss": 0.3709, "step": 1953 }, { "epoch": 177.6787878787879, "grad_norm": 0.28243038058280945, "learning_rate": 3.046e-05, "loss": 0.3904, "step": 1954 }, { "epoch": 177.77575757575758, "grad_norm": 0.32643869519233704, "learning_rate": 3.045e-05, "loss": 0.3602, "step": 1955 }, { "epoch": 177.87272727272727, "grad_norm": 0.36092421412467957, "learning_rate": 3.0440000000000003e-05, "loss": 0.3731, "step": 1956 }, { "epoch": 177.96969696969697, "grad_norm": 0.25088557600975037, "learning_rate": 3.0430000000000002e-05, "loss": 0.4032, "step": 1957 }, { "epoch": 178.0, "grad_norm": 0.4627428948879242, "learning_rate": 3.0420000000000004e-05, "loss": 0.4199, "step": 1958 }, { "epoch": 178.0969696969697, "grad_norm": 0.24005559086799622, "learning_rate": 3.041e-05, "loss": 0.3832, "step": 1959 }, { "epoch": 178.1939393939394, "grad_norm": 0.3331547677516937, "learning_rate": 3.04e-05, "loss": 0.4401, "step": 1960 }, { "epoch": 178.1939393939394, "eval_loss": 0.45817330479621887, "eval_runtime": 2.1406, "eval_samples_per_second": 25.693, "eval_steps_per_second": 3.27, "step": 1960 }, { "epoch": 178.29090909090908, "grad_norm": 0.3940174877643585, "learning_rate": 3.0390000000000002e-05, "loss": 0.3819, "step": 1961 }, { "epoch": 178.38787878787878, "grad_norm": 0.29317134618759155, "learning_rate": 3.0380000000000004e-05, "loss": 0.3818, "step": 1962 }, { "epoch": 178.4848484848485, "grad_norm": 0.2608102560043335, "learning_rate": 3.0370000000000006e-05, "loss": 0.4172, "step": 1963 }, { "epoch": 178.5818181818182, "grad_norm": 0.2783978283405304, "learning_rate": 3.036e-05, "loss": 0.3945, "step": 1964 }, { "epoch": 178.6787878787879, "grad_norm": 0.2786385715007782, "learning_rate": 3.035e-05, "loss": 0.4058, "step": 1965 }, { "epoch": 178.77575757575758, "grad_norm": 0.3624214828014374, "learning_rate": 3.034e-05, "loss": 0.38, "step": 1966 }, { "epoch": 178.87272727272727, "grad_norm": 0.23733411729335785, "learning_rate": 3.0330000000000003e-05, "loss": 0.3582, "step": 1967 }, { "epoch": 178.96969696969697, "grad_norm": 0.3076571822166443, "learning_rate": 3.0320000000000004e-05, "loss": 0.3797, "step": 1968 }, { "epoch": 179.0, "grad_norm": 0.47409960627555847, "learning_rate": 3.031e-05, "loss": 0.3919, "step": 1969 }, { "epoch": 179.0969696969697, "grad_norm": 0.2679012715816498, "learning_rate": 3.03e-05, "loss": 0.3471, "step": 1970 }, { "epoch": 179.0969696969697, "eval_loss": 0.4579102694988251, "eval_runtime": 2.151, "eval_samples_per_second": 25.57, "eval_steps_per_second": 3.254, "step": 1970 }, { "epoch": 179.1939393939394, "grad_norm": 0.37176313996315, "learning_rate": 3.0290000000000003e-05, "loss": 0.3834, "step": 1971 }, { "epoch": 179.29090909090908, "grad_norm": 0.36216312646865845, "learning_rate": 3.028e-05, "loss": 0.4412, "step": 1972 }, { "epoch": 179.38787878787878, "grad_norm": 0.36082252860069275, "learning_rate": 3.0270000000000003e-05, "loss": 0.381, "step": 1973 }, { "epoch": 179.4848484848485, "grad_norm": 0.3448978662490845, "learning_rate": 3.0259999999999998e-05, "loss": 0.3909, "step": 1974 }, { "epoch": 179.5818181818182, "grad_norm": 0.2553895115852356, "learning_rate": 3.025e-05, "loss": 0.3737, "step": 1975 }, { "epoch": 179.6787878787879, "grad_norm": 0.2908379137516022, "learning_rate": 3.0240000000000002e-05, "loss": 0.4023, "step": 1976 }, { "epoch": 179.77575757575758, "grad_norm": 0.2847512662410736, "learning_rate": 3.0230000000000004e-05, "loss": 0.3828, "step": 1977 }, { "epoch": 179.87272727272727, "grad_norm": 0.31389573216438293, "learning_rate": 3.0220000000000005e-05, "loss": 0.4101, "step": 1978 }, { "epoch": 179.96969696969697, "grad_norm": 0.27856436371803284, "learning_rate": 3.021e-05, "loss": 0.3984, "step": 1979 }, { "epoch": 180.0, "grad_norm": 0.6234988570213318, "learning_rate": 3.02e-05, "loss": 0.411, "step": 1980 }, { "epoch": 180.0, "eval_loss": 0.45648694038391113, "eval_runtime": 2.1436, "eval_samples_per_second": 25.657, "eval_steps_per_second": 3.265, "step": 1980 }, { "epoch": 180.0969696969697, "grad_norm": 0.2653573751449585, "learning_rate": 3.019e-05, "loss": 0.3949, "step": 1981 }, { "epoch": 180.1939393939394, "grad_norm": 0.38016894459724426, "learning_rate": 3.0180000000000002e-05, "loss": 0.3905, "step": 1982 }, { "epoch": 180.29090909090908, "grad_norm": 0.35457536578178406, "learning_rate": 3.0170000000000004e-05, "loss": 0.4022, "step": 1983 }, { "epoch": 180.38787878787878, "grad_norm": 0.2664441764354706, "learning_rate": 3.016e-05, "loss": 0.3634, "step": 1984 }, { "epoch": 180.4848484848485, "grad_norm": 0.37940555810928345, "learning_rate": 3.015e-05, "loss": 0.3936, "step": 1985 }, { "epoch": 180.5818181818182, "grad_norm": 0.27955812215805054, "learning_rate": 3.0140000000000003e-05, "loss": 0.3521, "step": 1986 }, { "epoch": 180.6787878787879, "grad_norm": 0.3075166344642639, "learning_rate": 3.013e-05, "loss": 0.3979, "step": 1987 }, { "epoch": 180.77575757575758, "grad_norm": 0.4995088279247284, "learning_rate": 3.0120000000000003e-05, "loss": 0.4299, "step": 1988 }, { "epoch": 180.87272727272727, "grad_norm": 0.30396339297294617, "learning_rate": 3.0109999999999998e-05, "loss": 0.4124, "step": 1989 }, { "epoch": 180.96969696969697, "grad_norm": 0.26562339067459106, "learning_rate": 3.01e-05, "loss": 0.3467, "step": 1990 }, { "epoch": 180.96969696969697, "eval_loss": 0.4557584822177887, "eval_runtime": 2.1466, "eval_samples_per_second": 25.622, "eval_steps_per_second": 3.261, "step": 1990 }, { "epoch": 181.0, "grad_norm": 0.4487966001033783, "learning_rate": 3.009e-05, "loss": 0.4811, "step": 1991 }, { "epoch": 181.0969696969697, "grad_norm": 0.3044120967388153, "learning_rate": 3.0080000000000003e-05, "loss": 0.386, "step": 1992 }, { "epoch": 181.1939393939394, "grad_norm": 0.3281792104244232, "learning_rate": 3.0070000000000005e-05, "loss": 0.409, "step": 1993 }, { "epoch": 181.29090909090908, "grad_norm": 0.33631032705307007, "learning_rate": 3.006e-05, "loss": 0.3704, "step": 1994 }, { "epoch": 181.38787878787878, "grad_norm": 0.2445061206817627, "learning_rate": 3.0050000000000002e-05, "loss": 0.3865, "step": 1995 }, { "epoch": 181.4848484848485, "grad_norm": 0.35595229268074036, "learning_rate": 3.004e-05, "loss": 0.3441, "step": 1996 }, { "epoch": 181.5818181818182, "grad_norm": 0.26783713698387146, "learning_rate": 3.0030000000000002e-05, "loss": 0.3794, "step": 1997 }, { "epoch": 181.6787878787879, "grad_norm": 0.2763480544090271, "learning_rate": 3.0020000000000004e-05, "loss": 0.3984, "step": 1998 }, { "epoch": 181.77575757575758, "grad_norm": 0.28086158633232117, "learning_rate": 3.001e-05, "loss": 0.406, "step": 1999 }, { "epoch": 181.87272727272727, "grad_norm": 0.24243846535682678, "learning_rate": 3e-05, "loss": 0.3838, "step": 2000 }, { "epoch": 181.87272727272727, "eval_loss": 0.45567676424980164, "eval_runtime": 2.1323, "eval_samples_per_second": 25.794, "eval_steps_per_second": 3.283, "step": 2000 }, { "epoch": 181.96969696969697, "grad_norm": 0.37673303484916687, "learning_rate": 2.9990000000000003e-05, "loss": 0.4429, "step": 2001 }, { "epoch": 182.0, "grad_norm": 0.39634788036346436, "learning_rate": 2.998e-05, "loss": 0.3811, "step": 2002 }, { "epoch": 182.0969696969697, "grad_norm": 0.2639450430870056, "learning_rate": 2.9970000000000003e-05, "loss": 0.3787, "step": 2003 }, { "epoch": 182.1939393939394, "grad_norm": 0.2618735730648041, "learning_rate": 2.9959999999999998e-05, "loss": 0.4243, "step": 2004 }, { "epoch": 182.29090909090908, "grad_norm": 0.2818409502506256, "learning_rate": 2.995e-05, "loss": 0.3799, "step": 2005 }, { "epoch": 182.38787878787878, "grad_norm": 0.2500140070915222, "learning_rate": 2.994e-05, "loss": 0.3458, "step": 2006 }, { "epoch": 182.4848484848485, "grad_norm": 0.28353533148765564, "learning_rate": 2.9930000000000003e-05, "loss": 0.4154, "step": 2007 }, { "epoch": 182.5818181818182, "grad_norm": 0.26767697930336, "learning_rate": 2.9920000000000005e-05, "loss": 0.4024, "step": 2008 }, { "epoch": 182.6787878787879, "grad_norm": 0.26572415232658386, "learning_rate": 2.991e-05, "loss": 0.3788, "step": 2009 }, { "epoch": 182.77575757575758, "grad_norm": 0.3166835308074951, "learning_rate": 2.9900000000000002e-05, "loss": 0.4086, "step": 2010 }, { "epoch": 182.77575757575758, "eval_loss": 0.45494434237480164, "eval_runtime": 2.1299, "eval_samples_per_second": 25.823, "eval_steps_per_second": 3.287, "step": 2010 }, { "epoch": 182.87272727272727, "grad_norm": 0.3935854434967041, "learning_rate": 2.989e-05, "loss": 0.3866, "step": 2011 }, { "epoch": 182.96969696969697, "grad_norm": 0.2760485112667084, "learning_rate": 2.9880000000000002e-05, "loss": 0.3652, "step": 2012 }, { "epoch": 183.0, "grad_norm": 0.4534105360507965, "learning_rate": 2.9870000000000004e-05, "loss": 0.4278, "step": 2013 }, { "epoch": 183.0969696969697, "grad_norm": 0.2943841814994812, "learning_rate": 2.986e-05, "loss": 0.375, "step": 2014 }, { "epoch": 183.1939393939394, "grad_norm": 0.3969006836414337, "learning_rate": 2.985e-05, "loss": 0.3654, "step": 2015 }, { "epoch": 183.29090909090908, "grad_norm": 0.3221188187599182, "learning_rate": 2.9840000000000002e-05, "loss": 0.3803, "step": 2016 }, { "epoch": 183.38787878787878, "grad_norm": 0.2897550165653229, "learning_rate": 2.9830000000000004e-05, "loss": 0.4101, "step": 2017 }, { "epoch": 183.4848484848485, "grad_norm": 0.256380170583725, "learning_rate": 2.9820000000000002e-05, "loss": 0.4126, "step": 2018 }, { "epoch": 183.5818181818182, "grad_norm": 0.28354132175445557, "learning_rate": 2.9809999999999997e-05, "loss": 0.3683, "step": 2019 }, { "epoch": 183.6787878787879, "grad_norm": 0.2842443287372589, "learning_rate": 2.98e-05, "loss": 0.405, "step": 2020 }, { "epoch": 183.6787878787879, "eval_loss": 0.454367995262146, "eval_runtime": 2.1417, "eval_samples_per_second": 25.68, "eval_steps_per_second": 3.268, "step": 2020 }, { "epoch": 183.77575757575758, "grad_norm": 0.29224082827568054, "learning_rate": 2.979e-05, "loss": 0.4091, "step": 2021 }, { "epoch": 183.87272727272727, "grad_norm": 0.2452160269021988, "learning_rate": 2.9780000000000003e-05, "loss": 0.3958, "step": 2022 }, { "epoch": 183.96969696969697, "grad_norm": 0.2455267459154129, "learning_rate": 2.9770000000000005e-05, "loss": 0.3598, "step": 2023 }, { "epoch": 184.0, "grad_norm": 0.5353454351425171, "learning_rate": 2.976e-05, "loss": 0.4148, "step": 2024 }, { "epoch": 184.0969696969697, "grad_norm": 0.23789070546627045, "learning_rate": 2.975e-05, "loss": 0.3749, "step": 2025 }, { "epoch": 184.1939393939394, "grad_norm": 0.27197128534317017, "learning_rate": 2.974e-05, "loss": 0.3735, "step": 2026 }, { "epoch": 184.29090909090908, "grad_norm": 0.32617834210395813, "learning_rate": 2.973e-05, "loss": 0.4064, "step": 2027 }, { "epoch": 184.38787878787878, "grad_norm": 0.33649569749832153, "learning_rate": 2.9720000000000003e-05, "loss": 0.3779, "step": 2028 }, { "epoch": 184.4848484848485, "grad_norm": 0.3602868914604187, "learning_rate": 2.971e-05, "loss": 0.3829, "step": 2029 }, { "epoch": 184.5818181818182, "grad_norm": 0.2936662435531616, "learning_rate": 2.97e-05, "loss": 0.3844, "step": 2030 }, { "epoch": 184.5818181818182, "eval_loss": 0.45421287417411804, "eval_runtime": 2.1497, "eval_samples_per_second": 25.584, "eval_steps_per_second": 3.256, "step": 2030 }, { "epoch": 184.6787878787879, "grad_norm": 0.2704029977321625, "learning_rate": 2.9690000000000002e-05, "loss": 0.3553, "step": 2031 }, { "epoch": 184.77575757575758, "grad_norm": 0.2969801723957062, "learning_rate": 2.9680000000000004e-05, "loss": 0.4152, "step": 2032 }, { "epoch": 184.87272727272727, "grad_norm": 0.36191806197166443, "learning_rate": 2.9670000000000002e-05, "loss": 0.4175, "step": 2033 }, { "epoch": 184.96969696969697, "grad_norm": 0.32152867317199707, "learning_rate": 2.9659999999999997e-05, "loss": 0.387, "step": 2034 }, { "epoch": 185.0, "grad_norm": 0.42188048362731934, "learning_rate": 2.965e-05, "loss": 0.4182, "step": 2035 }, { "epoch": 185.0969696969697, "grad_norm": 0.28894734382629395, "learning_rate": 2.964e-05, "loss": 0.4227, "step": 2036 }, { "epoch": 185.1939393939394, "grad_norm": 0.3921942412853241, "learning_rate": 2.9630000000000003e-05, "loss": 0.3479, "step": 2037 }, { "epoch": 185.29090909090908, "grad_norm": 0.31626754999160767, "learning_rate": 2.9620000000000004e-05, "loss": 0.3833, "step": 2038 }, { "epoch": 185.38787878787878, "grad_norm": 0.2705192565917969, "learning_rate": 2.961e-05, "loss": 0.3865, "step": 2039 }, { "epoch": 185.4848484848485, "grad_norm": 0.2821842133998871, "learning_rate": 2.96e-05, "loss": 0.3941, "step": 2040 }, { "epoch": 185.4848484848485, "eval_loss": 0.45374125242233276, "eval_runtime": 2.1472, "eval_samples_per_second": 25.615, "eval_steps_per_second": 3.26, "step": 2040 }, { "epoch": 185.5818181818182, "grad_norm": 0.26259809732437134, "learning_rate": 2.959e-05, "loss": 0.4028, "step": 2041 }, { "epoch": 185.6787878787879, "grad_norm": 0.3034418225288391, "learning_rate": 2.958e-05, "loss": 0.3733, "step": 2042 }, { "epoch": 185.77575757575758, "grad_norm": 0.2680684030056, "learning_rate": 2.9570000000000003e-05, "loss": 0.382, "step": 2043 }, { "epoch": 185.87272727272727, "grad_norm": 0.224136620759964, "learning_rate": 2.9559999999999998e-05, "loss": 0.348, "step": 2044 }, { "epoch": 185.96969696969697, "grad_norm": 0.28951144218444824, "learning_rate": 2.955e-05, "loss": 0.4173, "step": 2045 }, { "epoch": 186.0, "grad_norm": 0.6578705906867981, "learning_rate": 2.9540000000000002e-05, "loss": 0.4462, "step": 2046 }, { "epoch": 186.0969696969697, "grad_norm": 0.37858498096466064, "learning_rate": 2.9530000000000004e-05, "loss": 0.3548, "step": 2047 }, { "epoch": 186.1939393939394, "grad_norm": 0.25204986333847046, "learning_rate": 2.9520000000000002e-05, "loss": 0.4039, "step": 2048 }, { "epoch": 186.29090909090908, "grad_norm": 0.36383822560310364, "learning_rate": 2.951e-05, "loss": 0.3992, "step": 2049 }, { "epoch": 186.38787878787878, "grad_norm": 0.2452254593372345, "learning_rate": 2.95e-05, "loss": 0.409, "step": 2050 }, { "epoch": 186.38787878787878, "eval_loss": 0.45269790291786194, "eval_runtime": 2.1403, "eval_samples_per_second": 25.698, "eval_steps_per_second": 3.271, "step": 2050 }, { "epoch": 186.4848484848485, "grad_norm": 0.24856916069984436, "learning_rate": 2.949e-05, "loss": 0.3842, "step": 2051 }, { "epoch": 186.5818181818182, "grad_norm": 0.27388259768486023, "learning_rate": 2.9480000000000002e-05, "loss": 0.3932, "step": 2052 }, { "epoch": 186.6787878787879, "grad_norm": 0.24813078343868256, "learning_rate": 2.9470000000000004e-05, "loss": 0.3902, "step": 2053 }, { "epoch": 186.77575757575758, "grad_norm": 0.2655707895755768, "learning_rate": 2.946e-05, "loss": 0.3691, "step": 2054 }, { "epoch": 186.87272727272727, "grad_norm": 0.3013978600502014, "learning_rate": 2.945e-05, "loss": 0.3864, "step": 2055 }, { "epoch": 186.96969696969697, "grad_norm": 0.32454273104667664, "learning_rate": 2.944e-05, "loss": 0.3617, "step": 2056 }, { "epoch": 187.0, "grad_norm": 0.5444182753562927, "learning_rate": 2.943e-05, "loss": 0.4437, "step": 2057 }, { "epoch": 187.0969696969697, "grad_norm": 0.2726549804210663, "learning_rate": 2.9420000000000003e-05, "loss": 0.4053, "step": 2058 }, { "epoch": 187.1939393939394, "grad_norm": 0.2990370988845825, "learning_rate": 2.9409999999999998e-05, "loss": 0.3343, "step": 2059 }, { "epoch": 187.29090909090908, "grad_norm": 0.33290451765060425, "learning_rate": 2.94e-05, "loss": 0.4081, "step": 2060 }, { "epoch": 187.29090909090908, "eval_loss": 0.4533584713935852, "eval_runtime": 2.1628, "eval_samples_per_second": 25.43, "eval_steps_per_second": 3.237, "step": 2060 }, { "epoch": 187.38787878787878, "grad_norm": 0.313956081867218, "learning_rate": 2.939e-05, "loss": 0.3903, "step": 2061 }, { "epoch": 187.4848484848485, "grad_norm": 0.282910019159317, "learning_rate": 2.9380000000000003e-05, "loss": 0.4115, "step": 2062 }, { "epoch": 187.5818181818182, "grad_norm": 0.301270067691803, "learning_rate": 2.9370000000000002e-05, "loss": 0.3726, "step": 2063 }, { "epoch": 187.6787878787879, "grad_norm": 0.30559325218200684, "learning_rate": 2.9360000000000003e-05, "loss": 0.388, "step": 2064 }, { "epoch": 187.77575757575758, "grad_norm": 0.31332719326019287, "learning_rate": 2.935e-05, "loss": 0.3771, "step": 2065 }, { "epoch": 187.87272727272727, "grad_norm": 0.31585612893104553, "learning_rate": 2.934e-05, "loss": 0.3978, "step": 2066 }, { "epoch": 187.96969696969697, "grad_norm": 0.30112504959106445, "learning_rate": 2.9330000000000002e-05, "loss": 0.3806, "step": 2067 }, { "epoch": 188.0, "grad_norm": 0.43080729246139526, "learning_rate": 2.9320000000000004e-05, "loss": 0.3825, "step": 2068 }, { "epoch": 188.0969696969697, "grad_norm": 0.29388001561164856, "learning_rate": 2.9310000000000006e-05, "loss": 0.3544, "step": 2069 }, { "epoch": 188.1939393939394, "grad_norm": 0.3585991859436035, "learning_rate": 2.93e-05, "loss": 0.4224, "step": 2070 }, { "epoch": 188.1939393939394, "eval_loss": 0.45311594009399414, "eval_runtime": 2.1539, "eval_samples_per_second": 25.536, "eval_steps_per_second": 3.25, "step": 2070 }, { "epoch": 188.29090909090908, "grad_norm": 0.27885720133781433, "learning_rate": 2.929e-05, "loss": 0.3926, "step": 2071 }, { "epoch": 188.38787878787878, "grad_norm": 0.34820637106895447, "learning_rate": 2.928e-05, "loss": 0.401, "step": 2072 }, { "epoch": 188.4848484848485, "grad_norm": 0.3126968741416931, "learning_rate": 2.9270000000000003e-05, "loss": 0.3889, "step": 2073 }, { "epoch": 188.5818181818182, "grad_norm": 0.2805291414260864, "learning_rate": 2.9260000000000004e-05, "loss": 0.3369, "step": 2074 }, { "epoch": 188.6787878787879, "grad_norm": 0.26318931579589844, "learning_rate": 2.925e-05, "loss": 0.3957, "step": 2075 }, { "epoch": 188.77575757575758, "grad_norm": 0.2963278293609619, "learning_rate": 2.924e-05, "loss": 0.3928, "step": 2076 }, { "epoch": 188.87272727272727, "grad_norm": 0.2633761167526245, "learning_rate": 2.9230000000000003e-05, "loss": 0.3823, "step": 2077 }, { "epoch": 188.96969696969697, "grad_norm": 0.37855419516563416, "learning_rate": 2.922e-05, "loss": 0.3874, "step": 2078 }, { "epoch": 189.0, "grad_norm": 0.5898931622505188, "learning_rate": 2.9210000000000003e-05, "loss": 0.396, "step": 2079 }, { "epoch": 189.0969696969697, "grad_norm": 0.3189804255962372, "learning_rate": 2.9199999999999998e-05, "loss": 0.3979, "step": 2080 }, { "epoch": 189.0969696969697, "eval_loss": 0.4517754912376404, "eval_runtime": 2.1255, "eval_samples_per_second": 25.876, "eval_steps_per_second": 3.293, "step": 2080 }, { "epoch": 189.1939393939394, "grad_norm": 0.3477194309234619, "learning_rate": 2.919e-05, "loss": 0.4225, "step": 2081 }, { "epoch": 189.29090909090908, "grad_norm": 0.3043428957462311, "learning_rate": 2.9180000000000002e-05, "loss": 0.3858, "step": 2082 }, { "epoch": 189.38787878787878, "grad_norm": 0.30492085218429565, "learning_rate": 2.9170000000000004e-05, "loss": 0.4159, "step": 2083 }, { "epoch": 189.4848484848485, "grad_norm": 0.30474886298179626, "learning_rate": 2.9160000000000005e-05, "loss": 0.3593, "step": 2084 }, { "epoch": 189.5818181818182, "grad_norm": 0.28546661138534546, "learning_rate": 2.915e-05, "loss": 0.3774, "step": 2085 }, { "epoch": 189.6787878787879, "grad_norm": 0.2670576870441437, "learning_rate": 2.9140000000000002e-05, "loss": 0.361, "step": 2086 }, { "epoch": 189.77575757575758, "grad_norm": 0.25876742601394653, "learning_rate": 2.913e-05, "loss": 0.346, "step": 2087 }, { "epoch": 189.87272727272727, "grad_norm": 0.2909092605113983, "learning_rate": 2.9120000000000002e-05, "loss": 0.4011, "step": 2088 }, { "epoch": 189.96969696969697, "grad_norm": 0.3466051518917084, "learning_rate": 2.9110000000000004e-05, "loss": 0.3702, "step": 2089 }, { "epoch": 190.0, "grad_norm": 0.37901443243026733, "learning_rate": 2.91e-05, "loss": 0.4301, "step": 2090 }, { "epoch": 190.0, "eval_loss": 0.4515250325202942, "eval_runtime": 2.1422, "eval_samples_per_second": 25.674, "eval_steps_per_second": 3.268, "step": 2090 }, { "epoch": 190.0969696969697, "grad_norm": 0.30174320936203003, "learning_rate": 2.909e-05, "loss": 0.4002, "step": 2091 }, { "epoch": 190.1939393939394, "grad_norm": 0.299001544713974, "learning_rate": 2.9080000000000003e-05, "loss": 0.4014, "step": 2092 }, { "epoch": 190.29090909090908, "grad_norm": 0.2551984488964081, "learning_rate": 2.907e-05, "loss": 0.3411, "step": 2093 }, { "epoch": 190.38787878787878, "grad_norm": 0.28803738951683044, "learning_rate": 2.9060000000000003e-05, "loss": 0.3405, "step": 2094 }, { "epoch": 190.4848484848485, "grad_norm": 0.240384042263031, "learning_rate": 2.9049999999999998e-05, "loss": 0.3615, "step": 2095 }, { "epoch": 190.5818181818182, "grad_norm": 0.27672815322875977, "learning_rate": 2.904e-05, "loss": 0.3987, "step": 2096 }, { "epoch": 190.6787878787879, "grad_norm": 0.29123345017433167, "learning_rate": 2.903e-05, "loss": 0.3931, "step": 2097 }, { "epoch": 190.77575757575758, "grad_norm": 0.30591174960136414, "learning_rate": 2.9020000000000003e-05, "loss": 0.4157, "step": 2098 }, { "epoch": 190.87272727272727, "grad_norm": 0.2984998822212219, "learning_rate": 2.9010000000000005e-05, "loss": 0.3936, "step": 2099 }, { "epoch": 190.96969696969697, "grad_norm": 0.2587094008922577, "learning_rate": 2.9e-05, "loss": 0.3863, "step": 2100 }, { "epoch": 190.96969696969697, "eval_loss": 0.4510568380355835, "eval_runtime": 2.1397, "eval_samples_per_second": 25.704, "eval_steps_per_second": 3.271, "step": 2100 }, { "epoch": 191.0, "grad_norm": 0.41683897376060486, "learning_rate": 2.8990000000000002e-05, "loss": 0.4114, "step": 2101 }, { "epoch": 191.0969696969697, "grad_norm": 0.27058327198028564, "learning_rate": 2.898e-05, "loss": 0.38, "step": 2102 }, { "epoch": 191.1939393939394, "grad_norm": 0.24893786013126373, "learning_rate": 2.8970000000000002e-05, "loss": 0.4132, "step": 2103 }, { "epoch": 191.29090909090908, "grad_norm": 0.3115560710430145, "learning_rate": 2.8960000000000004e-05, "loss": 0.3814, "step": 2104 }, { "epoch": 191.38787878787878, "grad_norm": 0.2888801395893097, "learning_rate": 2.895e-05, "loss": 0.4135, "step": 2105 }, { "epoch": 191.4848484848485, "grad_norm": 0.306196391582489, "learning_rate": 2.894e-05, "loss": 0.3731, "step": 2106 }, { "epoch": 191.5818181818182, "grad_norm": 0.2939477860927582, "learning_rate": 2.8930000000000003e-05, "loss": 0.3631, "step": 2107 }, { "epoch": 191.6787878787879, "grad_norm": 0.26490044593811035, "learning_rate": 2.8920000000000004e-05, "loss": 0.3973, "step": 2108 }, { "epoch": 191.77575757575758, "grad_norm": 0.24801471829414368, "learning_rate": 2.8910000000000003e-05, "loss": 0.3534, "step": 2109 }, { "epoch": 191.87272727272727, "grad_norm": 0.29833361506462097, "learning_rate": 2.8899999999999998e-05, "loss": 0.3721, "step": 2110 }, { "epoch": 191.87272727272727, "eval_loss": 0.4510841369628906, "eval_runtime": 2.1402, "eval_samples_per_second": 25.699, "eval_steps_per_second": 3.271, "step": 2110 }, { "epoch": 191.96969696969697, "grad_norm": 0.3207988142967224, "learning_rate": 2.889e-05, "loss": 0.3783, "step": 2111 }, { "epoch": 192.0, "grad_norm": 0.46868717670440674, "learning_rate": 2.888e-05, "loss": 0.4238, "step": 2112 }, { "epoch": 192.0969696969697, "grad_norm": 0.24790030717849731, "learning_rate": 2.8870000000000003e-05, "loss": 0.3819, "step": 2113 }, { "epoch": 192.1939393939394, "grad_norm": 0.36050891876220703, "learning_rate": 2.8860000000000005e-05, "loss": 0.3866, "step": 2114 }, { "epoch": 192.29090909090908, "grad_norm": 0.29458919167518616, "learning_rate": 2.885e-05, "loss": 0.3591, "step": 2115 }, { "epoch": 192.38787878787878, "grad_norm": 0.2597523331642151, "learning_rate": 2.8840000000000002e-05, "loss": 0.3779, "step": 2116 }, { "epoch": 192.4848484848485, "grad_norm": 0.26730599999427795, "learning_rate": 2.883e-05, "loss": 0.4044, "step": 2117 }, { "epoch": 192.5818181818182, "grad_norm": 0.3013802766799927, "learning_rate": 2.8820000000000002e-05, "loss": 0.3506, "step": 2118 }, { "epoch": 192.6787878787879, "grad_norm": 0.30980682373046875, "learning_rate": 2.8810000000000004e-05, "loss": 0.3812, "step": 2119 }, { "epoch": 192.77575757575758, "grad_norm": 0.284757137298584, "learning_rate": 2.88e-05, "loss": 0.3879, "step": 2120 }, { "epoch": 192.77575757575758, "eval_loss": 0.45064395666122437, "eval_runtime": 2.1303, "eval_samples_per_second": 25.817, "eval_steps_per_second": 3.286, "step": 2120 }, { "epoch": 192.87272727272727, "grad_norm": 0.31305962800979614, "learning_rate": 2.879e-05, "loss": 0.41, "step": 2121 }, { "epoch": 192.96969696969697, "grad_norm": 0.3607388734817505, "learning_rate": 2.8780000000000002e-05, "loss": 0.3922, "step": 2122 }, { "epoch": 193.0, "grad_norm": 0.5440636277198792, "learning_rate": 2.8770000000000004e-05, "loss": 0.3857, "step": 2123 }, { "epoch": 193.0969696969697, "grad_norm": 0.25659671425819397, "learning_rate": 2.8760000000000002e-05, "loss": 0.3733, "step": 2124 }, { "epoch": 193.1939393939394, "grad_norm": 0.2667579650878906, "learning_rate": 2.8749999999999997e-05, "loss": 0.3897, "step": 2125 }, { "epoch": 193.29090909090908, "grad_norm": 0.3180757761001587, "learning_rate": 2.874e-05, "loss": 0.4107, "step": 2126 }, { "epoch": 193.38787878787878, "grad_norm": 0.2646811902523041, "learning_rate": 2.873e-05, "loss": 0.353, "step": 2127 }, { "epoch": 193.4848484848485, "grad_norm": 0.27166447043418884, "learning_rate": 2.8720000000000003e-05, "loss": 0.3432, "step": 2128 }, { "epoch": 193.5818181818182, "grad_norm": 0.3005528151988983, "learning_rate": 2.8710000000000005e-05, "loss": 0.4227, "step": 2129 }, { "epoch": 193.6787878787879, "grad_norm": 0.29820331931114197, "learning_rate": 2.87e-05, "loss": 0.3951, "step": 2130 }, { "epoch": 193.6787878787879, "eval_loss": 0.45010998845100403, "eval_runtime": 2.1348, "eval_samples_per_second": 25.764, "eval_steps_per_second": 3.279, "step": 2130 }, { "epoch": 193.77575757575758, "grad_norm": 0.28893670439720154, "learning_rate": 2.869e-05, "loss": 0.3715, "step": 2131 }, { "epoch": 193.87272727272727, "grad_norm": 0.30619752407073975, "learning_rate": 2.868e-05, "loss": 0.4044, "step": 2132 }, { "epoch": 193.96969696969697, "grad_norm": 0.3117333650588989, "learning_rate": 2.867e-05, "loss": 0.3557, "step": 2133 }, { "epoch": 194.0, "grad_norm": 0.5674205422401428, "learning_rate": 2.8660000000000003e-05, "loss": 0.4006, "step": 2134 }, { "epoch": 194.0969696969697, "grad_norm": 0.24678772687911987, "learning_rate": 2.865e-05, "loss": 0.4015, "step": 2135 }, { "epoch": 194.1939393939394, "grad_norm": 0.30944305658340454, "learning_rate": 2.864e-05, "loss": 0.3721, "step": 2136 }, { "epoch": 194.29090909090908, "grad_norm": 0.301979124546051, "learning_rate": 2.8630000000000002e-05, "loss": 0.3505, "step": 2137 }, { "epoch": 194.38787878787878, "grad_norm": 0.23500828444957733, "learning_rate": 2.8620000000000004e-05, "loss": 0.36, "step": 2138 }, { "epoch": 194.4848484848485, "grad_norm": 0.27564796805381775, "learning_rate": 2.8610000000000002e-05, "loss": 0.3744, "step": 2139 }, { "epoch": 194.5818181818182, "grad_norm": 0.28560516238212585, "learning_rate": 2.86e-05, "loss": 0.3813, "step": 2140 }, { "epoch": 194.5818181818182, "eval_loss": 0.44997575879096985, "eval_runtime": 2.1293, "eval_samples_per_second": 25.83, "eval_steps_per_second": 3.287, "step": 2140 }, { "epoch": 194.6787878787879, "grad_norm": 0.3202413022518158, "learning_rate": 2.859e-05, "loss": 0.4007, "step": 2141 }, { "epoch": 194.77575757575758, "grad_norm": 0.2470165342092514, "learning_rate": 2.858e-05, "loss": 0.3691, "step": 2142 }, { "epoch": 194.87272727272727, "grad_norm": 0.3099730312824249, "learning_rate": 2.8570000000000003e-05, "loss": 0.3977, "step": 2143 }, { "epoch": 194.96969696969697, "grad_norm": 0.33016228675842285, "learning_rate": 2.8560000000000004e-05, "loss": 0.409, "step": 2144 }, { "epoch": 195.0, "grad_norm": 0.5346285104751587, "learning_rate": 2.855e-05, "loss": 0.383, "step": 2145 }, { "epoch": 195.0969696969697, "grad_norm": 0.3117358088493347, "learning_rate": 2.854e-05, "loss": 0.4108, "step": 2146 }, { "epoch": 195.1939393939394, "grad_norm": 0.3187137246131897, "learning_rate": 2.853e-05, "loss": 0.3952, "step": 2147 }, { "epoch": 195.29090909090908, "grad_norm": 0.25608211755752563, "learning_rate": 2.852e-05, "loss": 0.3825, "step": 2148 }, { "epoch": 195.38787878787878, "grad_norm": 0.2572425901889801, "learning_rate": 2.8510000000000003e-05, "loss": 0.3897, "step": 2149 }, { "epoch": 195.4848484848485, "grad_norm": 0.24927036464214325, "learning_rate": 2.8499999999999998e-05, "loss": 0.3956, "step": 2150 }, { "epoch": 195.4848484848485, "eval_loss": 0.4496622681617737, "eval_runtime": 2.156, "eval_samples_per_second": 25.51, "eval_steps_per_second": 3.247, "step": 2150 }, { "epoch": 195.5818181818182, "grad_norm": 0.29352957010269165, "learning_rate": 2.849e-05, "loss": 0.3445, "step": 2151 }, { "epoch": 195.6787878787879, "grad_norm": 0.2819279730319977, "learning_rate": 2.8480000000000002e-05, "loss": 0.3872, "step": 2152 }, { "epoch": 195.77575757575758, "grad_norm": 0.26512467861175537, "learning_rate": 2.8470000000000004e-05, "loss": 0.3793, "step": 2153 }, { "epoch": 195.87272727272727, "grad_norm": 0.29471302032470703, "learning_rate": 2.8460000000000002e-05, "loss": 0.3611, "step": 2154 }, { "epoch": 195.96969696969697, "grad_norm": 0.3353519141674042, "learning_rate": 2.845e-05, "loss": 0.3636, "step": 2155 }, { "epoch": 196.0, "grad_norm": 0.5895360708236694, "learning_rate": 2.844e-05, "loss": 0.3931, "step": 2156 }, { "epoch": 196.0969696969697, "grad_norm": 0.296806275844574, "learning_rate": 2.843e-05, "loss": 0.3933, "step": 2157 }, { "epoch": 196.1939393939394, "grad_norm": 0.27521470189094543, "learning_rate": 2.8420000000000002e-05, "loss": 0.3828, "step": 2158 }, { "epoch": 196.29090909090908, "grad_norm": 0.3265436887741089, "learning_rate": 2.8410000000000004e-05, "loss": 0.3615, "step": 2159 }, { "epoch": 196.38787878787878, "grad_norm": 0.32005420327186584, "learning_rate": 2.84e-05, "loss": 0.4026, "step": 2160 }, { "epoch": 196.38787878787878, "eval_loss": 0.4497809112071991, "eval_runtime": 2.1349, "eval_samples_per_second": 25.762, "eval_steps_per_second": 3.279, "step": 2160 }, { "epoch": 196.4848484848485, "grad_norm": 0.31602486968040466, "learning_rate": 2.839e-05, "loss": 0.3344, "step": 2161 }, { "epoch": 196.5818181818182, "grad_norm": 0.29071107506752014, "learning_rate": 2.8380000000000003e-05, "loss": 0.428, "step": 2162 }, { "epoch": 196.6787878787879, "grad_norm": 0.3036842942237854, "learning_rate": 2.837e-05, "loss": 0.4099, "step": 2163 }, { "epoch": 196.77575757575758, "grad_norm": 0.2522382140159607, "learning_rate": 2.8360000000000003e-05, "loss": 0.4009, "step": 2164 }, { "epoch": 196.87272727272727, "grad_norm": 0.2813572883605957, "learning_rate": 2.8349999999999998e-05, "loss": 0.3761, "step": 2165 }, { "epoch": 196.96969696969697, "grad_norm": 0.2621469795703888, "learning_rate": 2.834e-05, "loss": 0.3328, "step": 2166 }, { "epoch": 197.0, "grad_norm": 0.5489789843559265, "learning_rate": 2.833e-05, "loss": 0.3352, "step": 2167 }, { "epoch": 197.0969696969697, "grad_norm": 0.24636216461658478, "learning_rate": 2.8320000000000003e-05, "loss": 0.3706, "step": 2168 }, { "epoch": 197.1939393939394, "grad_norm": 0.2669205665588379, "learning_rate": 2.8310000000000002e-05, "loss": 0.3886, "step": 2169 }, { "epoch": 197.29090909090908, "grad_norm": 0.36443209648132324, "learning_rate": 2.83e-05, "loss": 0.3641, "step": 2170 }, { "epoch": 197.29090909090908, "eval_loss": 0.44957584142684937, "eval_runtime": 2.1169, "eval_samples_per_second": 25.981, "eval_steps_per_second": 3.307, "step": 2170 }, { "epoch": 197.38787878787878, "grad_norm": 0.3616332411766052, "learning_rate": 2.829e-05, "loss": 0.3687, "step": 2171 }, { "epoch": 197.4848484848485, "grad_norm": 0.29667091369628906, "learning_rate": 2.828e-05, "loss": 0.3602, "step": 2172 }, { "epoch": 197.5818181818182, "grad_norm": 0.34419506788253784, "learning_rate": 2.8270000000000002e-05, "loss": 0.3893, "step": 2173 }, { "epoch": 197.6787878787879, "grad_norm": 0.2823171019554138, "learning_rate": 2.8260000000000004e-05, "loss": 0.3839, "step": 2174 }, { "epoch": 197.77575757575758, "grad_norm": 0.33764100074768066, "learning_rate": 2.825e-05, "loss": 0.4157, "step": 2175 }, { "epoch": 197.87272727272727, "grad_norm": 0.30144527554512024, "learning_rate": 2.824e-05, "loss": 0.4137, "step": 2176 }, { "epoch": 197.96969696969697, "grad_norm": 0.30979812145233154, "learning_rate": 2.8230000000000002e-05, "loss": 0.3674, "step": 2177 }, { "epoch": 198.0, "grad_norm": 0.43826353549957275, "learning_rate": 2.822e-05, "loss": 0.3122, "step": 2178 }, { "epoch": 198.0969696969697, "grad_norm": 0.2648268938064575, "learning_rate": 2.8210000000000003e-05, "loss": 0.3007, "step": 2179 }, { "epoch": 198.1939393939394, "grad_norm": 0.2786037027835846, "learning_rate": 2.8199999999999998e-05, "loss": 0.3952, "step": 2180 }, { "epoch": 198.1939393939394, "eval_loss": 0.448219358921051, "eval_runtime": 2.1609, "eval_samples_per_second": 25.452, "eval_steps_per_second": 3.239, "step": 2180 }, { "epoch": 198.29090909090908, "grad_norm": 0.31549665331840515, "learning_rate": 2.819e-05, "loss": 0.3769, "step": 2181 }, { "epoch": 198.38787878787878, "grad_norm": 0.3384656310081482, "learning_rate": 2.818e-05, "loss": 0.4068, "step": 2182 }, { "epoch": 198.4848484848485, "grad_norm": 0.33379441499710083, "learning_rate": 2.8170000000000003e-05, "loss": 0.4166, "step": 2183 }, { "epoch": 198.5818181818182, "grad_norm": 0.4237292408943176, "learning_rate": 2.816e-05, "loss": 0.3673, "step": 2184 }, { "epoch": 198.6787878787879, "grad_norm": 0.306974858045578, "learning_rate": 2.815e-05, "loss": 0.3976, "step": 2185 }, { "epoch": 198.77575757575758, "grad_norm": 0.28793197870254517, "learning_rate": 2.8139999999999998e-05, "loss": 0.3793, "step": 2186 }, { "epoch": 198.87272727272727, "grad_norm": 0.28818053007125854, "learning_rate": 2.813e-05, "loss": 0.3847, "step": 2187 }, { "epoch": 198.96969696969697, "grad_norm": 0.34341973066329956, "learning_rate": 2.8120000000000002e-05, "loss": 0.3795, "step": 2188 }, { "epoch": 199.0, "grad_norm": 0.46411457657814026, "learning_rate": 2.8110000000000004e-05, "loss": 0.3738, "step": 2189 }, { "epoch": 199.0969696969697, "grad_norm": 0.32759159803390503, "learning_rate": 2.8100000000000005e-05, "loss": 0.39, "step": 2190 }, { "epoch": 199.0969696969697, "eval_loss": 0.4479880928993225, "eval_runtime": 2.1443, "eval_samples_per_second": 25.649, "eval_steps_per_second": 3.264, "step": 2190 }, { "epoch": 199.1939393939394, "grad_norm": 0.2490648478269577, "learning_rate": 2.809e-05, "loss": 0.3804, "step": 2191 }, { "epoch": 199.29090909090908, "grad_norm": 0.3017059564590454, "learning_rate": 2.8080000000000002e-05, "loss": 0.3681, "step": 2192 }, { "epoch": 199.38787878787878, "grad_norm": 0.3729502856731415, "learning_rate": 2.807e-05, "loss": 0.3944, "step": 2193 }, { "epoch": 199.4848484848485, "grad_norm": 0.32608261704444885, "learning_rate": 2.8060000000000002e-05, "loss": 0.3895, "step": 2194 }, { "epoch": 199.5818181818182, "grad_norm": 0.23186321556568146, "learning_rate": 2.8050000000000004e-05, "loss": 0.3592, "step": 2195 }, { "epoch": 199.6787878787879, "grad_norm": 0.3033948540687561, "learning_rate": 2.804e-05, "loss": 0.396, "step": 2196 }, { "epoch": 199.77575757575758, "grad_norm": 0.30141010880470276, "learning_rate": 2.803e-05, "loss": 0.3821, "step": 2197 }, { "epoch": 199.87272727272727, "grad_norm": 0.3135485351085663, "learning_rate": 2.8020000000000003e-05, "loss": 0.3644, "step": 2198 }, { "epoch": 199.96969696969697, "grad_norm": 0.27928251028060913, "learning_rate": 2.8010000000000005e-05, "loss": 0.3543, "step": 2199 }, { "epoch": 200.0, "grad_norm": 0.4405915439128876, "learning_rate": 2.8000000000000003e-05, "loss": 0.419, "step": 2200 }, { "epoch": 200.0, "eval_loss": 0.4478053152561188, "eval_runtime": 2.1288, "eval_samples_per_second": 25.836, "eval_steps_per_second": 3.288, "step": 2200 }, { "epoch": 200.0969696969697, "grad_norm": 0.25227150321006775, "learning_rate": 2.7989999999999998e-05, "loss": 0.3807, "step": 2201 }, { "epoch": 200.1939393939394, "grad_norm": 0.27379876375198364, "learning_rate": 2.798e-05, "loss": 0.3747, "step": 2202 }, { "epoch": 200.29090909090908, "grad_norm": 0.3297831118106842, "learning_rate": 2.797e-05, "loss": 0.3817, "step": 2203 }, { "epoch": 200.38787878787878, "grad_norm": 0.3016502261161804, "learning_rate": 2.7960000000000003e-05, "loss": 0.3829, "step": 2204 }, { "epoch": 200.4848484848485, "grad_norm": 0.3145178258419037, "learning_rate": 2.7950000000000005e-05, "loss": 0.3586, "step": 2205 }, { "epoch": 200.5818181818182, "grad_norm": 0.3010222017765045, "learning_rate": 2.794e-05, "loss": 0.3441, "step": 2206 }, { "epoch": 200.6787878787879, "grad_norm": 0.27790072560310364, "learning_rate": 2.7930000000000002e-05, "loss": 0.3872, "step": 2207 }, { "epoch": 200.77575757575758, "grad_norm": 0.306438148021698, "learning_rate": 2.792e-05, "loss": 0.402, "step": 2208 }, { "epoch": 200.87272727272727, "grad_norm": 0.3268650472164154, "learning_rate": 2.7910000000000002e-05, "loss": 0.4085, "step": 2209 }, { "epoch": 200.96969696969697, "grad_norm": 0.28407397866249084, "learning_rate": 2.7900000000000004e-05, "loss": 0.3667, "step": 2210 }, { "epoch": 200.96969696969697, "eval_loss": 0.44716876745224, "eval_runtime": 2.1402, "eval_samples_per_second": 25.698, "eval_steps_per_second": 3.271, "step": 2210 }, { "epoch": 201.0, "grad_norm": 0.537050724029541, "learning_rate": 2.789e-05, "loss": 0.3747, "step": 2211 }, { "epoch": 201.0969696969697, "grad_norm": 0.34605276584625244, "learning_rate": 2.788e-05, "loss": 0.3457, "step": 2212 }, { "epoch": 201.1939393939394, "grad_norm": 0.3041094243526459, "learning_rate": 2.7870000000000003e-05, "loss": 0.3857, "step": 2213 }, { "epoch": 201.29090909090908, "grad_norm": 0.2671644389629364, "learning_rate": 2.7860000000000004e-05, "loss": 0.4177, "step": 2214 }, { "epoch": 201.38787878787878, "grad_norm": 0.29513826966285706, "learning_rate": 2.7850000000000003e-05, "loss": 0.4293, "step": 2215 }, { "epoch": 201.4848484848485, "grad_norm": 0.34477704763412476, "learning_rate": 2.7839999999999998e-05, "loss": 0.3838, "step": 2216 }, { "epoch": 201.5818181818182, "grad_norm": 0.2400050014257431, "learning_rate": 2.783e-05, "loss": 0.3408, "step": 2217 }, { "epoch": 201.6787878787879, "grad_norm": 0.25592660903930664, "learning_rate": 2.782e-05, "loss": 0.3827, "step": 2218 }, { "epoch": 201.77575757575758, "grad_norm": 0.3099137842655182, "learning_rate": 2.7810000000000003e-05, "loss": 0.3614, "step": 2219 }, { "epoch": 201.87272727272727, "grad_norm": 0.3049536645412445, "learning_rate": 2.7800000000000005e-05, "loss": 0.3757, "step": 2220 }, { "epoch": 201.87272727272727, "eval_loss": 0.4471142888069153, "eval_runtime": 2.1643, "eval_samples_per_second": 25.413, "eval_steps_per_second": 3.234, "step": 2220 }, { "epoch": 201.96969696969697, "grad_norm": 0.26360929012298584, "learning_rate": 2.779e-05, "loss": 0.3576, "step": 2221 }, { "epoch": 202.0, "grad_norm": 0.5084606409072876, "learning_rate": 2.778e-05, "loss": 0.3695, "step": 2222 }, { "epoch": 202.0969696969697, "grad_norm": 0.306815505027771, "learning_rate": 2.777e-05, "loss": 0.4109, "step": 2223 }, { "epoch": 202.1939393939394, "grad_norm": 0.28837597370147705, "learning_rate": 2.7760000000000002e-05, "loss": 0.3717, "step": 2224 }, { "epoch": 202.29090909090908, "grad_norm": 0.2985439896583557, "learning_rate": 2.7750000000000004e-05, "loss": 0.3379, "step": 2225 }, { "epoch": 202.38787878787878, "grad_norm": 0.34035661816596985, "learning_rate": 2.774e-05, "loss": 0.3933, "step": 2226 }, { "epoch": 202.4848484848485, "grad_norm": 0.31377413868904114, "learning_rate": 2.773e-05, "loss": 0.359, "step": 2227 }, { "epoch": 202.5818181818182, "grad_norm": 0.3064294457435608, "learning_rate": 2.7720000000000002e-05, "loss": 0.387, "step": 2228 }, { "epoch": 202.6787878787879, "grad_norm": 0.27735260128974915, "learning_rate": 2.7710000000000004e-05, "loss": 0.3856, "step": 2229 }, { "epoch": 202.77575757575758, "grad_norm": 0.2875250279903412, "learning_rate": 2.7700000000000002e-05, "loss": 0.3819, "step": 2230 }, { "epoch": 202.77575757575758, "eval_loss": 0.4466860294342041, "eval_runtime": 2.1516, "eval_samples_per_second": 25.563, "eval_steps_per_second": 3.253, "step": 2230 }, { "epoch": 202.87272727272727, "grad_norm": 0.26599356532096863, "learning_rate": 2.769e-05, "loss": 0.3431, "step": 2231 }, { "epoch": 202.96969696969697, "grad_norm": 0.27645865082740784, "learning_rate": 2.768e-05, "loss": 0.3793, "step": 2232 }, { "epoch": 203.0, "grad_norm": 0.6768845915794373, "learning_rate": 2.767e-05, "loss": 0.4634, "step": 2233 }, { "epoch": 203.0969696969697, "grad_norm": 0.2488522231578827, "learning_rate": 2.7660000000000003e-05, "loss": 0.3871, "step": 2234 }, { "epoch": 203.1939393939394, "grad_norm": 0.25670602917671204, "learning_rate": 2.7650000000000005e-05, "loss": 0.3764, "step": 2235 }, { "epoch": 203.29090909090908, "grad_norm": 0.2679460346698761, "learning_rate": 2.764e-05, "loss": 0.3713, "step": 2236 }, { "epoch": 203.38787878787878, "grad_norm": 0.2824065685272217, "learning_rate": 2.763e-05, "loss": 0.4196, "step": 2237 }, { "epoch": 203.4848484848485, "grad_norm": 0.3017754852771759, "learning_rate": 2.762e-05, "loss": 0.4152, "step": 2238 }, { "epoch": 203.5818181818182, "grad_norm": 0.31364500522613525, "learning_rate": 2.761e-05, "loss": 0.3406, "step": 2239 }, { "epoch": 203.6787878787879, "grad_norm": 0.3167931139469147, "learning_rate": 2.7600000000000003e-05, "loss": 0.365, "step": 2240 }, { "epoch": 203.6787878787879, "eval_loss": 0.44658520817756653, "eval_runtime": 2.1274, "eval_samples_per_second": 25.853, "eval_steps_per_second": 3.29, "step": 2240 }, { "epoch": 203.77575757575758, "grad_norm": 0.2750272750854492, "learning_rate": 2.759e-05, "loss": 0.3776, "step": 2241 }, { "epoch": 203.87272727272727, "grad_norm": 0.3103097081184387, "learning_rate": 2.758e-05, "loss": 0.3668, "step": 2242 }, { "epoch": 203.96969696969697, "grad_norm": 0.2923380732536316, "learning_rate": 2.7570000000000002e-05, "loss": 0.3329, "step": 2243 }, { "epoch": 204.0, "grad_norm": 0.4510895609855652, "learning_rate": 2.7560000000000004e-05, "loss": 0.427, "step": 2244 }, { "epoch": 204.0969696969697, "grad_norm": 0.3134826123714447, "learning_rate": 2.7550000000000002e-05, "loss": 0.3567, "step": 2245 }, { "epoch": 204.1939393939394, "grad_norm": 0.35793009400367737, "learning_rate": 2.754e-05, "loss": 0.4098, "step": 2246 }, { "epoch": 204.29090909090908, "grad_norm": 0.2968956530094147, "learning_rate": 2.753e-05, "loss": 0.363, "step": 2247 }, { "epoch": 204.38787878787878, "grad_norm": 0.28569701313972473, "learning_rate": 2.752e-05, "loss": 0.3713, "step": 2248 }, { "epoch": 204.4848484848485, "grad_norm": 0.3336655795574188, "learning_rate": 2.7510000000000003e-05, "loss": 0.3785, "step": 2249 }, { "epoch": 204.5818181818182, "grad_norm": 0.3033703565597534, "learning_rate": 2.7500000000000004e-05, "loss": 0.3699, "step": 2250 }, { "epoch": 204.5818181818182, "eval_loss": 0.44568753242492676, "eval_runtime": 2.1588, "eval_samples_per_second": 25.478, "eval_steps_per_second": 3.243, "step": 2250 }, { "epoch": 204.6787878787879, "grad_norm": 0.31997036933898926, "learning_rate": 2.749e-05, "loss": 0.3259, "step": 2251 }, { "epoch": 204.77575757575758, "grad_norm": 0.281178742647171, "learning_rate": 2.748e-05, "loss": 0.4069, "step": 2252 }, { "epoch": 204.87272727272727, "grad_norm": 0.3126077950000763, "learning_rate": 2.7470000000000003e-05, "loss": 0.3975, "step": 2253 }, { "epoch": 204.96969696969697, "grad_norm": 0.31442731618881226, "learning_rate": 2.746e-05, "loss": 0.3947, "step": 2254 }, { "epoch": 205.0, "grad_norm": 0.6187658309936523, "learning_rate": 2.7450000000000003e-05, "loss": 0.3549, "step": 2255 }, { "epoch": 205.0969696969697, "grad_norm": 0.32099851965904236, "learning_rate": 2.7439999999999998e-05, "loss": 0.3769, "step": 2256 }, { "epoch": 205.1939393939394, "grad_norm": 0.4058302342891693, "learning_rate": 2.743e-05, "loss": 0.3651, "step": 2257 }, { "epoch": 205.29090909090908, "grad_norm": 0.3068258464336395, "learning_rate": 2.7420000000000002e-05, "loss": 0.4019, "step": 2258 }, { "epoch": 205.38787878787878, "grad_norm": 0.5158912539482117, "learning_rate": 2.7410000000000004e-05, "loss": 0.4054, "step": 2259 }, { "epoch": 205.4848484848485, "grad_norm": 0.28603148460388184, "learning_rate": 2.7400000000000002e-05, "loss": 0.3759, "step": 2260 }, { "epoch": 205.4848484848485, "eval_loss": 0.44500473141670227, "eval_runtime": 2.155, "eval_samples_per_second": 25.522, "eval_steps_per_second": 3.248, "step": 2260 }, { "epoch": 205.5818181818182, "grad_norm": 0.27636051177978516, "learning_rate": 2.739e-05, "loss": 0.3705, "step": 2261 }, { "epoch": 205.6787878787879, "grad_norm": 0.31432798504829407, "learning_rate": 2.738e-05, "loss": 0.3597, "step": 2262 }, { "epoch": 205.77575757575758, "grad_norm": 0.3602944016456604, "learning_rate": 2.737e-05, "loss": 0.3889, "step": 2263 }, { "epoch": 205.87272727272727, "grad_norm": 0.38743069767951965, "learning_rate": 2.7360000000000002e-05, "loss": 0.3703, "step": 2264 }, { "epoch": 205.96969696969697, "grad_norm": 0.30791616439819336, "learning_rate": 2.7350000000000004e-05, "loss": 0.3417, "step": 2265 }, { "epoch": 206.0, "grad_norm": 0.46710509061813354, "learning_rate": 2.734e-05, "loss": 0.3919, "step": 2266 }, { "epoch": 206.0969696969697, "grad_norm": 0.23429720103740692, "learning_rate": 2.733e-05, "loss": 0.3503, "step": 2267 }, { "epoch": 206.1939393939394, "grad_norm": 0.2622058391571045, "learning_rate": 2.7320000000000003e-05, "loss": 0.3682, "step": 2268 }, { "epoch": 206.29090909090908, "grad_norm": 0.341849684715271, "learning_rate": 2.731e-05, "loss": 0.3715, "step": 2269 }, { "epoch": 206.38787878787878, "grad_norm": 0.31176722049713135, "learning_rate": 2.7300000000000003e-05, "loss": 0.4036, "step": 2270 }, { "epoch": 206.38787878787878, "eval_loss": 0.4462737441062927, "eval_runtime": 2.141, "eval_samples_per_second": 25.689, "eval_steps_per_second": 3.27, "step": 2270 }, { "epoch": 206.4848484848485, "grad_norm": 0.30896806716918945, "learning_rate": 2.7289999999999998e-05, "loss": 0.3979, "step": 2271 }, { "epoch": 206.5818181818182, "grad_norm": 0.35733941197395325, "learning_rate": 2.728e-05, "loss": 0.3668, "step": 2272 }, { "epoch": 206.6787878787879, "grad_norm": 0.3435685932636261, "learning_rate": 2.727e-05, "loss": 0.3879, "step": 2273 }, { "epoch": 206.77575757575758, "grad_norm": 0.2502520978450775, "learning_rate": 2.7260000000000003e-05, "loss": 0.3613, "step": 2274 }, { "epoch": 206.87272727272727, "grad_norm": 0.3206669092178345, "learning_rate": 2.725e-05, "loss": 0.4247, "step": 2275 }, { "epoch": 206.96969696969697, "grad_norm": 0.2648768126964569, "learning_rate": 2.724e-05, "loss": 0.3348, "step": 2276 }, { "epoch": 207.0, "grad_norm": 0.6231780052185059, "learning_rate": 2.723e-05, "loss": 0.3228, "step": 2277 }, { "epoch": 207.0969696969697, "grad_norm": 0.3182898163795471, "learning_rate": 2.722e-05, "loss": 0.352, "step": 2278 }, { "epoch": 207.1939393939394, "grad_norm": 0.2684725224971771, "learning_rate": 2.7210000000000002e-05, "loss": 0.3666, "step": 2279 }, { "epoch": 207.29090909090908, "grad_norm": 0.2992083728313446, "learning_rate": 2.7200000000000004e-05, "loss": 0.4296, "step": 2280 }, { "epoch": 207.29090909090908, "eval_loss": 0.44510698318481445, "eval_runtime": 2.1509, "eval_samples_per_second": 25.571, "eval_steps_per_second": 3.254, "step": 2280 }, { "epoch": 207.38787878787878, "grad_norm": 0.26230400800704956, "learning_rate": 2.719e-05, "loss": 0.3608, "step": 2281 }, { "epoch": 207.4848484848485, "grad_norm": 0.3085038363933563, "learning_rate": 2.718e-05, "loss": 0.3991, "step": 2282 }, { "epoch": 207.5818181818182, "grad_norm": 0.33652815222740173, "learning_rate": 2.7170000000000002e-05, "loss": 0.3472, "step": 2283 }, { "epoch": 207.6787878787879, "grad_norm": 0.3844667375087738, "learning_rate": 2.716e-05, "loss": 0.3846, "step": 2284 }, { "epoch": 207.77575757575758, "grad_norm": 0.29387590289115906, "learning_rate": 2.7150000000000003e-05, "loss": 0.3576, "step": 2285 }, { "epoch": 207.87272727272727, "grad_norm": 0.30108508467674255, "learning_rate": 2.7139999999999998e-05, "loss": 0.3944, "step": 2286 }, { "epoch": 207.96969696969697, "grad_norm": 0.30221885442733765, "learning_rate": 2.713e-05, "loss": 0.3693, "step": 2287 }, { "epoch": 208.0, "grad_norm": 0.44697195291519165, "learning_rate": 2.712e-05, "loss": 0.3378, "step": 2288 }, { "epoch": 208.0969696969697, "grad_norm": 0.25601184368133545, "learning_rate": 2.7110000000000003e-05, "loss": 0.3761, "step": 2289 }, { "epoch": 208.1939393939394, "grad_norm": 0.283724844455719, "learning_rate": 2.7100000000000005e-05, "loss": 0.3876, "step": 2290 }, { "epoch": 208.1939393939394, "eval_loss": 0.4447232186794281, "eval_runtime": 2.1305, "eval_samples_per_second": 25.815, "eval_steps_per_second": 3.286, "step": 2290 }, { "epoch": 208.29090909090908, "grad_norm": 0.3504049479961395, "learning_rate": 2.709e-05, "loss": 0.3737, "step": 2291 }, { "epoch": 208.38787878787878, "grad_norm": 0.2954561114311218, "learning_rate": 2.7079999999999998e-05, "loss": 0.3726, "step": 2292 }, { "epoch": 208.4848484848485, "grad_norm": 0.27079299092292786, "learning_rate": 2.707e-05, "loss": 0.4073, "step": 2293 }, { "epoch": 208.5818181818182, "grad_norm": 0.2923456132411957, "learning_rate": 2.7060000000000002e-05, "loss": 0.3635, "step": 2294 }, { "epoch": 208.6787878787879, "grad_norm": 0.28545117378234863, "learning_rate": 2.7050000000000004e-05, "loss": 0.3716, "step": 2295 }, { "epoch": 208.77575757575758, "grad_norm": 0.28535234928131104, "learning_rate": 2.704e-05, "loss": 0.3699, "step": 2296 }, { "epoch": 208.87272727272727, "grad_norm": 0.3026406764984131, "learning_rate": 2.703e-05, "loss": 0.3638, "step": 2297 }, { "epoch": 208.96969696969697, "grad_norm": 0.3343816101551056, "learning_rate": 2.7020000000000002e-05, "loss": 0.369, "step": 2298 }, { "epoch": 209.0, "grad_norm": 0.46312960982322693, "learning_rate": 2.701e-05, "loss": 0.3242, "step": 2299 }, { "epoch": 209.0969696969697, "grad_norm": 0.2601545453071594, "learning_rate": 2.7000000000000002e-05, "loss": 0.3709, "step": 2300 }, { "epoch": 209.0969696969697, "eval_loss": 0.4444558620452881, "eval_runtime": 2.1319, "eval_samples_per_second": 25.799, "eval_steps_per_second": 3.284, "step": 2300 }, { "epoch": 209.1939393939394, "grad_norm": 0.2880859076976776, "learning_rate": 2.6989999999999997e-05, "loss": 0.375, "step": 2301 }, { "epoch": 209.29090909090908, "grad_norm": 0.4282890558242798, "learning_rate": 2.698e-05, "loss": 0.3903, "step": 2302 }, { "epoch": 209.38787878787878, "grad_norm": 0.28130725026130676, "learning_rate": 2.697e-05, "loss": 0.3495, "step": 2303 }, { "epoch": 209.4848484848485, "grad_norm": 0.33825841546058655, "learning_rate": 2.6960000000000003e-05, "loss": 0.392, "step": 2304 }, { "epoch": 209.5818181818182, "grad_norm": 0.34332937002182007, "learning_rate": 2.6950000000000005e-05, "loss": 0.3506, "step": 2305 }, { "epoch": 209.6787878787879, "grad_norm": 0.2920333445072174, "learning_rate": 2.694e-05, "loss": 0.3493, "step": 2306 }, { "epoch": 209.77575757575758, "grad_norm": 0.29194843769073486, "learning_rate": 2.693e-05, "loss": 0.4088, "step": 2307 }, { "epoch": 209.87272727272727, "grad_norm": 0.2841552495956421, "learning_rate": 2.692e-05, "loss": 0.4067, "step": 2308 }, { "epoch": 209.96969696969697, "grad_norm": 0.3399902582168579, "learning_rate": 2.691e-05, "loss": 0.3592, "step": 2309 }, { "epoch": 210.0, "grad_norm": 0.4557587206363678, "learning_rate": 2.6900000000000003e-05, "loss": 0.3301, "step": 2310 }, { "epoch": 210.0, "eval_loss": 0.44406941533088684, "eval_runtime": 2.1725, "eval_samples_per_second": 25.317, "eval_steps_per_second": 3.222, "step": 2310 }, { "epoch": 210.0969696969697, "grad_norm": 0.29916703701019287, "learning_rate": 2.689e-05, "loss": 0.3669, "step": 2311 }, { "epoch": 210.1939393939394, "grad_norm": 0.3633650541305542, "learning_rate": 2.688e-05, "loss": 0.3665, "step": 2312 }, { "epoch": 210.29090909090908, "grad_norm": 0.29734817147254944, "learning_rate": 2.6870000000000002e-05, "loss": 0.3624, "step": 2313 }, { "epoch": 210.38787878787878, "grad_norm": 0.2979733347892761, "learning_rate": 2.686e-05, "loss": 0.3705, "step": 2314 }, { "epoch": 210.4848484848485, "grad_norm": 0.3146786689758301, "learning_rate": 2.6850000000000002e-05, "loss": 0.3792, "step": 2315 }, { "epoch": 210.5818181818182, "grad_norm": 0.2864646017551422, "learning_rate": 2.6840000000000004e-05, "loss": 0.4047, "step": 2316 }, { "epoch": 210.6787878787879, "grad_norm": 0.24151170253753662, "learning_rate": 2.683e-05, "loss": 0.3796, "step": 2317 }, { "epoch": 210.77575757575758, "grad_norm": 0.2813226580619812, "learning_rate": 2.682e-05, "loss": 0.3795, "step": 2318 }, { "epoch": 210.87272727272727, "grad_norm": 0.28772884607315063, "learning_rate": 2.6810000000000003e-05, "loss": 0.3447, "step": 2319 }, { "epoch": 210.96969696969697, "grad_norm": 0.33018478751182556, "learning_rate": 2.6800000000000004e-05, "loss": 0.357, "step": 2320 }, { "epoch": 210.96969696969697, "eval_loss": 0.44407859444618225, "eval_runtime": 2.16, "eval_samples_per_second": 25.463, "eval_steps_per_second": 3.241, "step": 2320 }, { "epoch": 211.0, "grad_norm": 0.6959975361824036, "learning_rate": 2.6790000000000003e-05, "loss": 0.4455, "step": 2321 }, { "epoch": 211.0969696969697, "grad_norm": 0.26903417706489563, "learning_rate": 2.678e-05, "loss": 0.3935, "step": 2322 }, { "epoch": 211.1939393939394, "grad_norm": 0.2566149830818176, "learning_rate": 2.677e-05, "loss": 0.4047, "step": 2323 }, { "epoch": 211.29090909090908, "grad_norm": 0.3153493106365204, "learning_rate": 2.676e-05, "loss": 0.3853, "step": 2324 }, { "epoch": 211.38787878787878, "grad_norm": 0.33052006363868713, "learning_rate": 2.6750000000000003e-05, "loss": 0.3401, "step": 2325 }, { "epoch": 211.4848484848485, "grad_norm": 0.28321343660354614, "learning_rate": 2.6740000000000005e-05, "loss": 0.3642, "step": 2326 }, { "epoch": 211.5818181818182, "grad_norm": 0.2920601963996887, "learning_rate": 2.673e-05, "loss": 0.3674, "step": 2327 }, { "epoch": 211.6787878787879, "grad_norm": 0.30511558055877686, "learning_rate": 2.672e-05, "loss": 0.3397, "step": 2328 }, { "epoch": 211.77575757575758, "grad_norm": 0.2932232916355133, "learning_rate": 2.671e-05, "loss": 0.372, "step": 2329 }, { "epoch": 211.87272727272727, "grad_norm": 0.26344335079193115, "learning_rate": 2.6700000000000002e-05, "loss": 0.3836, "step": 2330 }, { "epoch": 211.87272727272727, "eval_loss": 0.44368764758110046, "eval_runtime": 2.1428, "eval_samples_per_second": 25.667, "eval_steps_per_second": 3.267, "step": 2330 }, { "epoch": 211.96969696969697, "grad_norm": 0.302018940448761, "learning_rate": 2.6690000000000004e-05, "loss": 0.3491, "step": 2331 }, { "epoch": 212.0, "grad_norm": 0.5806761980056763, "learning_rate": 2.668e-05, "loss": 0.4508, "step": 2332 }, { "epoch": 212.0969696969697, "grad_norm": 0.2934873402118683, "learning_rate": 2.667e-05, "loss": 0.3778, "step": 2333 }, { "epoch": 212.1939393939394, "grad_norm": 0.30157631635665894, "learning_rate": 2.6660000000000002e-05, "loss": 0.3947, "step": 2334 }, { "epoch": 212.29090909090908, "grad_norm": 0.34195688366889954, "learning_rate": 2.6650000000000004e-05, "loss": 0.3918, "step": 2335 }, { "epoch": 212.38787878787878, "grad_norm": 0.2895645797252655, "learning_rate": 2.6640000000000002e-05, "loss": 0.3873, "step": 2336 }, { "epoch": 212.4848484848485, "grad_norm": 0.3008175492286682, "learning_rate": 2.663e-05, "loss": 0.3488, "step": 2337 }, { "epoch": 212.5818181818182, "grad_norm": 0.3135222792625427, "learning_rate": 2.662e-05, "loss": 0.3687, "step": 2338 }, { "epoch": 212.6787878787879, "grad_norm": 0.3346012234687805, "learning_rate": 2.661e-05, "loss": 0.3364, "step": 2339 }, { "epoch": 212.77575757575758, "grad_norm": 0.2940306067466736, "learning_rate": 2.6600000000000003e-05, "loss": 0.3738, "step": 2340 }, { "epoch": 212.77575757575758, "eval_loss": 0.44305384159088135, "eval_runtime": 2.1483, "eval_samples_per_second": 25.602, "eval_steps_per_second": 3.258, "step": 2340 }, { "epoch": 212.87272727272727, "grad_norm": 0.3285348117351532, "learning_rate": 2.6590000000000005e-05, "loss": 0.362, "step": 2341 }, { "epoch": 212.96969696969697, "grad_norm": 0.32656341791152954, "learning_rate": 2.658e-05, "loss": 0.3652, "step": 2342 }, { "epoch": 213.0, "grad_norm": 0.48844948410987854, "learning_rate": 2.657e-05, "loss": 0.4201, "step": 2343 }, { "epoch": 213.0969696969697, "grad_norm": 0.29274335503578186, "learning_rate": 2.6560000000000003e-05, "loss": 0.3844, "step": 2344 }, { "epoch": 213.1939393939394, "grad_norm": 0.31116437911987305, "learning_rate": 2.655e-05, "loss": 0.3413, "step": 2345 }, { "epoch": 213.29090909090908, "grad_norm": 0.30364781618118286, "learning_rate": 2.6540000000000003e-05, "loss": 0.3795, "step": 2346 }, { "epoch": 213.38787878787878, "grad_norm": 0.2625868320465088, "learning_rate": 2.653e-05, "loss": 0.38, "step": 2347 }, { "epoch": 213.4848484848485, "grad_norm": 0.2870539128780365, "learning_rate": 2.652e-05, "loss": 0.3586, "step": 2348 }, { "epoch": 213.5818181818182, "grad_norm": 0.3897850215435028, "learning_rate": 2.6510000000000002e-05, "loss": 0.3713, "step": 2349 }, { "epoch": 213.6787878787879, "grad_norm": 0.3048601746559143, "learning_rate": 2.6500000000000004e-05, "loss": 0.4012, "step": 2350 }, { "epoch": 213.6787878787879, "eval_loss": 0.4433639645576477, "eval_runtime": 2.1461, "eval_samples_per_second": 25.627, "eval_steps_per_second": 3.262, "step": 2350 }, { "epoch": 213.77575757575758, "grad_norm": 0.26275619864463806, "learning_rate": 2.6490000000000002e-05, "loss": 0.3418, "step": 2351 }, { "epoch": 213.87272727272727, "grad_norm": 0.36444348096847534, "learning_rate": 2.648e-05, "loss": 0.4155, "step": 2352 }, { "epoch": 213.96969696969697, "grad_norm": 0.2950473427772522, "learning_rate": 2.647e-05, "loss": 0.3565, "step": 2353 }, { "epoch": 214.0, "grad_norm": 0.6876822113990784, "learning_rate": 2.646e-05, "loss": 0.3346, "step": 2354 }, { "epoch": 214.0969696969697, "grad_norm": 0.2745935916900635, "learning_rate": 2.6450000000000003e-05, "loss": 0.3661, "step": 2355 }, { "epoch": 214.1939393939394, "grad_norm": 0.29313480854034424, "learning_rate": 2.6440000000000004e-05, "loss": 0.3596, "step": 2356 }, { "epoch": 214.29090909090908, "grad_norm": 0.27624714374542236, "learning_rate": 2.643e-05, "loss": 0.3754, "step": 2357 }, { "epoch": 214.38787878787878, "grad_norm": 0.3175128996372223, "learning_rate": 2.642e-05, "loss": 0.3611, "step": 2358 }, { "epoch": 214.4848484848485, "grad_norm": 0.2875557541847229, "learning_rate": 2.6410000000000003e-05, "loss": 0.3296, "step": 2359 }, { "epoch": 214.5818181818182, "grad_norm": 0.32324209809303284, "learning_rate": 2.64e-05, "loss": 0.3988, "step": 2360 }, { "epoch": 214.5818181818182, "eval_loss": 0.442745566368103, "eval_runtime": 2.1394, "eval_samples_per_second": 25.708, "eval_steps_per_second": 3.272, "step": 2360 }, { "epoch": 214.6787878787879, "grad_norm": 0.28028708696365356, "learning_rate": 2.6390000000000003e-05, "loss": 0.3799, "step": 2361 }, { "epoch": 214.77575757575758, "grad_norm": 0.2773652672767639, "learning_rate": 2.6379999999999998e-05, "loss": 0.3755, "step": 2362 }, { "epoch": 214.87272727272727, "grad_norm": 0.3084127902984619, "learning_rate": 2.637e-05, "loss": 0.3815, "step": 2363 }, { "epoch": 214.96969696969697, "grad_norm": 0.30936163663864136, "learning_rate": 2.6360000000000002e-05, "loss": 0.3916, "step": 2364 }, { "epoch": 215.0, "grad_norm": 0.5225114822387695, "learning_rate": 2.6350000000000004e-05, "loss": 0.3516, "step": 2365 }, { "epoch": 215.0969696969697, "grad_norm": 0.3306688964366913, "learning_rate": 2.6340000000000002e-05, "loss": 0.3982, "step": 2366 }, { "epoch": 215.1939393939394, "grad_norm": 0.3688218891620636, "learning_rate": 2.633e-05, "loss": 0.3643, "step": 2367 }, { "epoch": 215.29090909090908, "grad_norm": 0.2552226483821869, "learning_rate": 2.632e-05, "loss": 0.3626, "step": 2368 }, { "epoch": 215.38787878787878, "grad_norm": 0.28460806608200073, "learning_rate": 2.631e-05, "loss": 0.3908, "step": 2369 }, { "epoch": 215.4848484848485, "grad_norm": 0.3210669755935669, "learning_rate": 2.6300000000000002e-05, "loss": 0.3495, "step": 2370 }, { "epoch": 215.4848484848485, "eval_loss": 0.4424516260623932, "eval_runtime": 2.1474, "eval_samples_per_second": 25.612, "eval_steps_per_second": 3.26, "step": 2370 }, { "epoch": 215.5818181818182, "grad_norm": 0.2910328209400177, "learning_rate": 2.6290000000000004e-05, "loss": 0.3904, "step": 2371 }, { "epoch": 215.6787878787879, "grad_norm": 0.2920510768890381, "learning_rate": 2.628e-05, "loss": 0.3432, "step": 2372 }, { "epoch": 215.77575757575758, "grad_norm": 0.2948629856109619, "learning_rate": 2.627e-05, "loss": 0.3663, "step": 2373 }, { "epoch": 215.87272727272727, "grad_norm": 0.2976299226284027, "learning_rate": 2.6260000000000003e-05, "loss": 0.3917, "step": 2374 }, { "epoch": 215.96969696969697, "grad_norm": 0.2646479308605194, "learning_rate": 2.625e-05, "loss": 0.3427, "step": 2375 }, { "epoch": 216.0, "grad_norm": 0.5189900994300842, "learning_rate": 2.6240000000000003e-05, "loss": 0.386, "step": 2376 }, { "epoch": 216.0969696969697, "grad_norm": 0.28399553894996643, "learning_rate": 2.6229999999999998e-05, "loss": 0.3693, "step": 2377 }, { "epoch": 216.1939393939394, "grad_norm": 0.23609177768230438, "learning_rate": 2.622e-05, "loss": 0.3611, "step": 2378 }, { "epoch": 216.29090909090908, "grad_norm": 0.3684936463832855, "learning_rate": 2.621e-05, "loss": 0.3526, "step": 2379 }, { "epoch": 216.38787878787878, "grad_norm": 0.28553107380867004, "learning_rate": 2.6200000000000003e-05, "loss": 0.3951, "step": 2380 }, { "epoch": 216.38787878787878, "eval_loss": 0.44313111901283264, "eval_runtime": 2.1413, "eval_samples_per_second": 25.686, "eval_steps_per_second": 3.269, "step": 2380 }, { "epoch": 216.4848484848485, "grad_norm": 0.3319821357727051, "learning_rate": 2.6190000000000005e-05, "loss": 0.3954, "step": 2381 }, { "epoch": 216.5818181818182, "grad_norm": 0.3223443925380707, "learning_rate": 2.618e-05, "loss": 0.3477, "step": 2382 }, { "epoch": 216.6787878787879, "grad_norm": 0.28395891189575195, "learning_rate": 2.617e-05, "loss": 0.349, "step": 2383 }, { "epoch": 216.77575757575758, "grad_norm": 0.2875801920890808, "learning_rate": 2.616e-05, "loss": 0.383, "step": 2384 }, { "epoch": 216.87272727272727, "grad_norm": 0.2594863772392273, "learning_rate": 2.6150000000000002e-05, "loss": 0.3727, "step": 2385 }, { "epoch": 216.96969696969697, "grad_norm": 0.2604277431964874, "learning_rate": 2.6140000000000004e-05, "loss": 0.3848, "step": 2386 }, { "epoch": 217.0, "grad_norm": 0.5404815673828125, "learning_rate": 2.613e-05, "loss": 0.3486, "step": 2387 }, { "epoch": 217.0969696969697, "grad_norm": 0.3194997012615204, "learning_rate": 2.612e-05, "loss": 0.4172, "step": 2388 }, { "epoch": 217.1939393939394, "grad_norm": 0.29180049896240234, "learning_rate": 2.6110000000000002e-05, "loss": 0.3639, "step": 2389 }, { "epoch": 217.29090909090908, "grad_norm": 0.24527181684970856, "learning_rate": 2.61e-05, "loss": 0.3385, "step": 2390 }, { "epoch": 217.29090909090908, "eval_loss": 0.44199734926223755, "eval_runtime": 2.1449, "eval_samples_per_second": 25.642, "eval_steps_per_second": 3.263, "step": 2390 }, { "epoch": 217.38787878787878, "grad_norm": 0.3221138119697571, "learning_rate": 2.6090000000000003e-05, "loss": 0.3591, "step": 2391 }, { "epoch": 217.4848484848485, "grad_norm": 0.326957643032074, "learning_rate": 2.6079999999999998e-05, "loss": 0.3872, "step": 2392 }, { "epoch": 217.5818181818182, "grad_norm": 0.30668917298316956, "learning_rate": 2.607e-05, "loss": 0.3616, "step": 2393 }, { "epoch": 217.6787878787879, "grad_norm": 0.29979145526885986, "learning_rate": 2.606e-05, "loss": 0.3654, "step": 2394 }, { "epoch": 217.77575757575758, "grad_norm": 0.24426676332950592, "learning_rate": 2.6050000000000003e-05, "loss": 0.3708, "step": 2395 }, { "epoch": 217.87272727272727, "grad_norm": 0.32726359367370605, "learning_rate": 2.6040000000000005e-05, "loss": 0.3694, "step": 2396 }, { "epoch": 217.96969696969697, "grad_norm": 0.27767452597618103, "learning_rate": 2.603e-05, "loss": 0.3707, "step": 2397 }, { "epoch": 218.0, "grad_norm": 0.4858188331127167, "learning_rate": 2.602e-05, "loss": 0.3486, "step": 2398 }, { "epoch": 218.0969696969697, "grad_norm": 0.2870570123195648, "learning_rate": 2.601e-05, "loss": 0.3497, "step": 2399 }, { "epoch": 218.1939393939394, "grad_norm": 0.3260629177093506, "learning_rate": 2.6000000000000002e-05, "loss": 0.3686, "step": 2400 }, { "epoch": 218.1939393939394, "eval_loss": 0.44243302941322327, "eval_runtime": 2.1191, "eval_samples_per_second": 25.954, "eval_steps_per_second": 3.303, "step": 2400 }, { "epoch": 218.29090909090908, "grad_norm": 0.2818470597267151, "learning_rate": 2.5990000000000004e-05, "loss": 0.3307, "step": 2401 }, { "epoch": 218.38787878787878, "grad_norm": 0.2645418047904968, "learning_rate": 2.598e-05, "loss": 0.3749, "step": 2402 }, { "epoch": 218.4848484848485, "grad_norm": 0.28982293605804443, "learning_rate": 2.597e-05, "loss": 0.3416, "step": 2403 }, { "epoch": 218.5818181818182, "grad_norm": 0.32691192626953125, "learning_rate": 2.5960000000000002e-05, "loss": 0.3879, "step": 2404 }, { "epoch": 218.6787878787879, "grad_norm": 0.39779746532440186, "learning_rate": 2.595e-05, "loss": 0.339, "step": 2405 }, { "epoch": 218.77575757575758, "grad_norm": 0.3154836893081665, "learning_rate": 2.5940000000000002e-05, "loss": 0.3836, "step": 2406 }, { "epoch": 218.87272727272727, "grad_norm": 0.26961442828178406, "learning_rate": 2.5929999999999997e-05, "loss": 0.4092, "step": 2407 }, { "epoch": 218.96969696969697, "grad_norm": 0.2830916941165924, "learning_rate": 2.592e-05, "loss": 0.3925, "step": 2408 }, { "epoch": 219.0, "grad_norm": 0.4212952256202698, "learning_rate": 2.591e-05, "loss": 0.4103, "step": 2409 }, { "epoch": 219.0969696969697, "grad_norm": 0.2978769540786743, "learning_rate": 2.5900000000000003e-05, "loss": 0.3848, "step": 2410 }, { "epoch": 219.0969696969697, "eval_loss": 0.4409344792366028, "eval_runtime": 2.1366, "eval_samples_per_second": 25.741, "eval_steps_per_second": 3.276, "step": 2410 }, { "epoch": 219.1939393939394, "grad_norm": 0.30883023142814636, "learning_rate": 2.5890000000000005e-05, "loss": 0.3975, "step": 2411 }, { "epoch": 219.29090909090908, "grad_norm": 0.28611019253730774, "learning_rate": 2.588e-05, "loss": 0.3675, "step": 2412 }, { "epoch": 219.38787878787878, "grad_norm": 0.31392866373062134, "learning_rate": 2.587e-05, "loss": 0.387, "step": 2413 }, { "epoch": 219.4848484848485, "grad_norm": 0.3225494623184204, "learning_rate": 2.586e-05, "loss": 0.3736, "step": 2414 }, { "epoch": 219.5818181818182, "grad_norm": 0.35923585295677185, "learning_rate": 2.585e-05, "loss": 0.386, "step": 2415 }, { "epoch": 219.6787878787879, "grad_norm": 0.3180157244205475, "learning_rate": 2.5840000000000003e-05, "loss": 0.3752, "step": 2416 }, { "epoch": 219.77575757575758, "grad_norm": 0.275451123714447, "learning_rate": 2.583e-05, "loss": 0.3636, "step": 2417 }, { "epoch": 219.87272727272727, "grad_norm": 0.25374045968055725, "learning_rate": 2.582e-05, "loss": 0.3155, "step": 2418 }, { "epoch": 219.96969696969697, "grad_norm": 0.2972322404384613, "learning_rate": 2.5810000000000002e-05, "loss": 0.3356, "step": 2419 }, { "epoch": 220.0, "grad_norm": 0.4347028434276581, "learning_rate": 2.58e-05, "loss": 0.3743, "step": 2420 }, { "epoch": 220.0, "eval_loss": 0.441646546125412, "eval_runtime": 2.1241, "eval_samples_per_second": 25.893, "eval_steps_per_second": 3.295, "step": 2420 }, { "epoch": 220.0969696969697, "grad_norm": 0.32299545407295227, "learning_rate": 2.5790000000000002e-05, "loss": 0.345, "step": 2421 }, { "epoch": 220.1939393939394, "grad_norm": 0.3762829601764679, "learning_rate": 2.5779999999999997e-05, "loss": 0.3814, "step": 2422 }, { "epoch": 220.29090909090908, "grad_norm": 0.2898702025413513, "learning_rate": 2.577e-05, "loss": 0.3791, "step": 2423 }, { "epoch": 220.38787878787878, "grad_norm": 0.267051637172699, "learning_rate": 2.576e-05, "loss": 0.3161, "step": 2424 }, { "epoch": 220.4848484848485, "grad_norm": 0.24987362325191498, "learning_rate": 2.5750000000000002e-05, "loss": 0.3901, "step": 2425 }, { "epoch": 220.5818181818182, "grad_norm": 0.26668789982795715, "learning_rate": 2.5740000000000004e-05, "loss": 0.3786, "step": 2426 }, { "epoch": 220.6787878787879, "grad_norm": 0.2954038679599762, "learning_rate": 2.573e-05, "loss": 0.3828, "step": 2427 }, { "epoch": 220.77575757575758, "grad_norm": 0.2518426775932312, "learning_rate": 2.572e-05, "loss": 0.3689, "step": 2428 }, { "epoch": 220.87272727272727, "grad_norm": 0.34907805919647217, "learning_rate": 2.571e-05, "loss": 0.3817, "step": 2429 }, { "epoch": 220.96969696969697, "grad_norm": 0.27415215969085693, "learning_rate": 2.57e-05, "loss": 0.3515, "step": 2430 }, { "epoch": 220.96969696969697, "eval_loss": 0.44070032238960266, "eval_runtime": 2.1477, "eval_samples_per_second": 25.609, "eval_steps_per_second": 3.259, "step": 2430 }, { "epoch": 221.0, "grad_norm": 0.4006612300872803, "learning_rate": 2.5690000000000003e-05, "loss": 0.397, "step": 2431 }, { "epoch": 221.0969696969697, "grad_norm": 0.2683582901954651, "learning_rate": 2.5679999999999998e-05, "loss": 0.3842, "step": 2432 }, { "epoch": 221.1939393939394, "grad_norm": 0.30595675110816956, "learning_rate": 2.567e-05, "loss": 0.3886, "step": 2433 }, { "epoch": 221.29090909090908, "grad_norm": 0.2578451931476593, "learning_rate": 2.566e-05, "loss": 0.3531, "step": 2434 }, { "epoch": 221.38787878787878, "grad_norm": 0.2802889049053192, "learning_rate": 2.5650000000000003e-05, "loss": 0.3652, "step": 2435 }, { "epoch": 221.4848484848485, "grad_norm": 0.3214050829410553, "learning_rate": 2.5640000000000002e-05, "loss": 0.3754, "step": 2436 }, { "epoch": 221.5818181818182, "grad_norm": 0.2835405766963959, "learning_rate": 2.5629999999999997e-05, "loss": 0.3837, "step": 2437 }, { "epoch": 221.6787878787879, "grad_norm": 0.24525125324726105, "learning_rate": 2.562e-05, "loss": 0.3381, "step": 2438 }, { "epoch": 221.77575757575758, "grad_norm": 0.31164395809173584, "learning_rate": 2.561e-05, "loss": 0.3975, "step": 2439 }, { "epoch": 221.87272727272727, "grad_norm": 0.2807594835758209, "learning_rate": 2.5600000000000002e-05, "loss": 0.351, "step": 2440 }, { "epoch": 221.87272727272727, "eval_loss": 0.4406813085079193, "eval_runtime": 2.1591, "eval_samples_per_second": 25.474, "eval_steps_per_second": 3.242, "step": 2440 }, { "epoch": 221.96969696969697, "grad_norm": 0.2830051779747009, "learning_rate": 2.5590000000000004e-05, "loss": 0.33, "step": 2441 }, { "epoch": 222.0, "grad_norm": 0.43788567185401917, "learning_rate": 2.5580000000000002e-05, "loss": 0.4056, "step": 2442 }, { "epoch": 222.0969696969697, "grad_norm": 0.26734405755996704, "learning_rate": 2.557e-05, "loss": 0.3465, "step": 2443 }, { "epoch": 222.1939393939394, "grad_norm": 0.28306853771209717, "learning_rate": 2.556e-05, "loss": 0.3666, "step": 2444 }, { "epoch": 222.29090909090908, "grad_norm": 0.2725483179092407, "learning_rate": 2.555e-05, "loss": 0.378, "step": 2445 }, { "epoch": 222.38787878787878, "grad_norm": 0.31050193309783936, "learning_rate": 2.5540000000000003e-05, "loss": 0.3966, "step": 2446 }, { "epoch": 222.4848484848485, "grad_norm": 0.2954963743686676, "learning_rate": 2.5530000000000005e-05, "loss": 0.3789, "step": 2447 }, { "epoch": 222.5818181818182, "grad_norm": 0.27009761333465576, "learning_rate": 2.552e-05, "loss": 0.3528, "step": 2448 }, { "epoch": 222.6787878787879, "grad_norm": 0.2876873314380646, "learning_rate": 2.551e-05, "loss": 0.3723, "step": 2449 }, { "epoch": 222.77575757575758, "grad_norm": 0.2781578302383423, "learning_rate": 2.5500000000000003e-05, "loss": 0.3448, "step": 2450 }, { "epoch": 222.77575757575758, "eval_loss": 0.4403097629547119, "eval_runtime": 2.1503, "eval_samples_per_second": 25.578, "eval_steps_per_second": 3.255, "step": 2450 }, { "epoch": 222.87272727272727, "grad_norm": 0.32250741124153137, "learning_rate": 2.549e-05, "loss": 0.3737, "step": 2451 }, { "epoch": 222.96969696969697, "grad_norm": 0.25998982787132263, "learning_rate": 2.5480000000000003e-05, "loss": 0.3628, "step": 2452 }, { "epoch": 223.0, "grad_norm": 0.6483612656593323, "learning_rate": 2.547e-05, "loss": 0.3658, "step": 2453 }, { "epoch": 223.0969696969697, "grad_norm": 0.27064916491508484, "learning_rate": 2.546e-05, "loss": 0.324, "step": 2454 }, { "epoch": 223.1939393939394, "grad_norm": 0.3874616324901581, "learning_rate": 2.5450000000000002e-05, "loss": 0.3746, "step": 2455 }, { "epoch": 223.29090909090908, "grad_norm": 0.2780890464782715, "learning_rate": 2.5440000000000004e-05, "loss": 0.3736, "step": 2456 }, { "epoch": 223.38787878787878, "grad_norm": 0.2915639877319336, "learning_rate": 2.5430000000000002e-05, "loss": 0.3724, "step": 2457 }, { "epoch": 223.4848484848485, "grad_norm": 0.2976807951927185, "learning_rate": 2.542e-05, "loss": 0.3596, "step": 2458 }, { "epoch": 223.5818181818182, "grad_norm": 0.27504071593284607, "learning_rate": 2.541e-05, "loss": 0.326, "step": 2459 }, { "epoch": 223.6787878787879, "grad_norm": 0.2684337794780731, "learning_rate": 2.54e-05, "loss": 0.3707, "step": 2460 }, { "epoch": 223.6787878787879, "eval_loss": 0.44031795859336853, "eval_runtime": 2.1228, "eval_samples_per_second": 25.91, "eval_steps_per_second": 3.298, "step": 2460 }, { "epoch": 223.77575757575758, "grad_norm": 0.2877907156944275, "learning_rate": 2.5390000000000003e-05, "loss": 0.3672, "step": 2461 }, { "epoch": 223.87272727272727, "grad_norm": 0.32059311866760254, "learning_rate": 2.5380000000000004e-05, "loss": 0.4111, "step": 2462 }, { "epoch": 223.96969696969697, "grad_norm": 0.27829068899154663, "learning_rate": 2.537e-05, "loss": 0.3862, "step": 2463 }, { "epoch": 224.0, "grad_norm": 0.5866357684135437, "learning_rate": 2.536e-05, "loss": 0.3742, "step": 2464 }, { "epoch": 224.0969696969697, "grad_norm": 0.2550725042819977, "learning_rate": 2.5350000000000003e-05, "loss": 0.3502, "step": 2465 }, { "epoch": 224.1939393939394, "grad_norm": 0.2818206250667572, "learning_rate": 2.534e-05, "loss": 0.3404, "step": 2466 }, { "epoch": 224.29090909090908, "grad_norm": 0.3186753988265991, "learning_rate": 2.5330000000000003e-05, "loss": 0.3526, "step": 2467 }, { "epoch": 224.38787878787878, "grad_norm": 0.2827731668949127, "learning_rate": 2.5319999999999998e-05, "loss": 0.3615, "step": 2468 }, { "epoch": 224.4848484848485, "grad_norm": 0.2541167438030243, "learning_rate": 2.531e-05, "loss": 0.3522, "step": 2469 }, { "epoch": 224.5818181818182, "grad_norm": 0.2639825642108917, "learning_rate": 2.5300000000000002e-05, "loss": 0.3718, "step": 2470 }, { "epoch": 224.5818181818182, "eval_loss": 0.43977347016334534, "eval_runtime": 2.1394, "eval_samples_per_second": 25.708, "eval_steps_per_second": 3.272, "step": 2470 }, { "epoch": 224.6787878787879, "grad_norm": 0.2934424579143524, "learning_rate": 2.5290000000000004e-05, "loss": 0.414, "step": 2471 }, { "epoch": 224.77575757575758, "grad_norm": 0.3221575915813446, "learning_rate": 2.5280000000000005e-05, "loss": 0.3758, "step": 2472 }, { "epoch": 224.87272727272727, "grad_norm": 0.2983260452747345, "learning_rate": 2.527e-05, "loss": 0.3965, "step": 2473 }, { "epoch": 224.96969696969697, "grad_norm": 0.28553321957588196, "learning_rate": 2.526e-05, "loss": 0.3533, "step": 2474 }, { "epoch": 225.0, "grad_norm": 0.5285137891769409, "learning_rate": 2.525e-05, "loss": 0.3625, "step": 2475 }, { "epoch": 225.0969696969697, "grad_norm": 0.2885406017303467, "learning_rate": 2.5240000000000002e-05, "loss": 0.3286, "step": 2476 }, { "epoch": 225.1939393939394, "grad_norm": 0.28469815850257874, "learning_rate": 2.5230000000000004e-05, "loss": 0.356, "step": 2477 }, { "epoch": 225.29090909090908, "grad_norm": 0.29516857862472534, "learning_rate": 2.522e-05, "loss": 0.328, "step": 2478 }, { "epoch": 225.38787878787878, "grad_norm": 0.2980527877807617, "learning_rate": 2.521e-05, "loss": 0.3609, "step": 2479 }, { "epoch": 225.4848484848485, "grad_norm": 0.32728075981140137, "learning_rate": 2.5200000000000003e-05, "loss": 0.3466, "step": 2480 }, { "epoch": 225.4848484848485, "eval_loss": 0.44033753871917725, "eval_runtime": 2.1249, "eval_samples_per_second": 25.884, "eval_steps_per_second": 3.294, "step": 2480 }, { "epoch": 225.5818181818182, "grad_norm": 0.36214345693588257, "learning_rate": 2.519e-05, "loss": 0.4097, "step": 2481 }, { "epoch": 225.6787878787879, "grad_norm": 0.38986948132514954, "learning_rate": 2.5180000000000003e-05, "loss": 0.3869, "step": 2482 }, { "epoch": 225.77575757575758, "grad_norm": 0.27240610122680664, "learning_rate": 2.5169999999999998e-05, "loss": 0.3773, "step": 2483 }, { "epoch": 225.87272727272727, "grad_norm": 0.2969878017902374, "learning_rate": 2.516e-05, "loss": 0.4009, "step": 2484 }, { "epoch": 225.96969696969697, "grad_norm": 0.3425680994987488, "learning_rate": 2.515e-05, "loss": 0.3723, "step": 2485 }, { "epoch": 226.0, "grad_norm": 0.47936302423477173, "learning_rate": 2.5140000000000003e-05, "loss": 0.3564, "step": 2486 }, { "epoch": 226.0969696969697, "grad_norm": 0.3240700364112854, "learning_rate": 2.5130000000000005e-05, "loss": 0.3674, "step": 2487 }, { "epoch": 226.1939393939394, "grad_norm": 0.2965087890625, "learning_rate": 2.512e-05, "loss": 0.3758, "step": 2488 }, { "epoch": 226.29090909090908, "grad_norm": 0.2678585350513458, "learning_rate": 2.5110000000000002e-05, "loss": 0.3782, "step": 2489 }, { "epoch": 226.38787878787878, "grad_norm": 0.3514326512813568, "learning_rate": 2.51e-05, "loss": 0.2942, "step": 2490 }, { "epoch": 226.38787878787878, "eval_loss": 0.4388936758041382, "eval_runtime": 2.1285, "eval_samples_per_second": 25.84, "eval_steps_per_second": 3.289, "step": 2490 }, { "epoch": 226.4848484848485, "grad_norm": 0.343416303396225, "learning_rate": 2.5090000000000002e-05, "loss": 0.3645, "step": 2491 }, { "epoch": 226.5818181818182, "grad_norm": 0.30410099029541016, "learning_rate": 2.5080000000000004e-05, "loss": 0.3294, "step": 2492 }, { "epoch": 226.6787878787879, "grad_norm": 0.26536622643470764, "learning_rate": 2.507e-05, "loss": 0.3534, "step": 2493 }, { "epoch": 226.77575757575758, "grad_norm": 0.27707982063293457, "learning_rate": 2.506e-05, "loss": 0.401, "step": 2494 }, { "epoch": 226.87272727272727, "grad_norm": 0.294041246175766, "learning_rate": 2.5050000000000002e-05, "loss": 0.4049, "step": 2495 }, { "epoch": 226.96969696969697, "grad_norm": 0.31618446111679077, "learning_rate": 2.504e-05, "loss": 0.4065, "step": 2496 }, { "epoch": 227.0, "grad_norm": 0.6172340512275696, "learning_rate": 2.5030000000000003e-05, "loss": 0.3122, "step": 2497 }, { "epoch": 227.0969696969697, "grad_norm": 0.24138444662094116, "learning_rate": 2.5019999999999998e-05, "loss": 0.3704, "step": 2498 }, { "epoch": 227.1939393939394, "grad_norm": 0.33306029438972473, "learning_rate": 2.501e-05, "loss": 0.4177, "step": 2499 }, { "epoch": 227.29090909090908, "grad_norm": 0.40666037797927856, "learning_rate": 2.5e-05, "loss": 0.3513, "step": 2500 }, { "epoch": 227.29090909090908, "eval_loss": 0.4396165609359741, "eval_runtime": 2.1224, "eval_samples_per_second": 25.914, "eval_steps_per_second": 3.298, "step": 2500 }, { "epoch": 227.38787878787878, "grad_norm": 0.29784542322158813, "learning_rate": 2.4990000000000003e-05, "loss": 0.3711, "step": 2501 }, { "epoch": 227.4848484848485, "grad_norm": 0.46334195137023926, "learning_rate": 2.498e-05, "loss": 0.3346, "step": 2502 }, { "epoch": 227.5818181818182, "grad_norm": 0.3165861964225769, "learning_rate": 2.4970000000000003e-05, "loss": 0.333, "step": 2503 }, { "epoch": 227.6787878787879, "grad_norm": 0.30741187930107117, "learning_rate": 2.496e-05, "loss": 0.372, "step": 2504 }, { "epoch": 227.77575757575758, "grad_norm": 0.2982431948184967, "learning_rate": 2.495e-05, "loss": 0.3405, "step": 2505 }, { "epoch": 227.87272727272727, "grad_norm": 0.32411104440689087, "learning_rate": 2.4940000000000002e-05, "loss": 0.4046, "step": 2506 }, { "epoch": 227.96969696969697, "grad_norm": 0.3008289635181427, "learning_rate": 2.493e-05, "loss": 0.3571, "step": 2507 }, { "epoch": 228.0, "grad_norm": 0.4273081123828888, "learning_rate": 2.4920000000000002e-05, "loss": 0.3727, "step": 2508 }, { "epoch": 228.0969696969697, "grad_norm": 0.29499098658561707, "learning_rate": 2.491e-05, "loss": 0.386, "step": 2509 }, { "epoch": 228.1939393939394, "grad_norm": 0.263407438993454, "learning_rate": 2.4900000000000002e-05, "loss": 0.3541, "step": 2510 }, { "epoch": 228.1939393939394, "eval_loss": 0.43920692801475525, "eval_runtime": 2.114, "eval_samples_per_second": 26.018, "eval_steps_per_second": 3.311, "step": 2510 }, { "epoch": 228.29090909090908, "grad_norm": 0.2716670036315918, "learning_rate": 2.489e-05, "loss": 0.3323, "step": 2511 }, { "epoch": 228.38787878787878, "grad_norm": 0.3222666084766388, "learning_rate": 2.488e-05, "loss": 0.3869, "step": 2512 }, { "epoch": 228.4848484848485, "grad_norm": 0.2719743251800537, "learning_rate": 2.487e-05, "loss": 0.3589, "step": 2513 }, { "epoch": 228.5818181818182, "grad_norm": 0.35152000188827515, "learning_rate": 2.486e-05, "loss": 0.3539, "step": 2514 }, { "epoch": 228.6787878787879, "grad_norm": 0.2843145728111267, "learning_rate": 2.485e-05, "loss": 0.3452, "step": 2515 }, { "epoch": 228.77575757575758, "grad_norm": 0.293863981962204, "learning_rate": 2.4840000000000003e-05, "loss": 0.3967, "step": 2516 }, { "epoch": 228.87272727272727, "grad_norm": 0.26122814416885376, "learning_rate": 2.483e-05, "loss": 0.3519, "step": 2517 }, { "epoch": 228.96969696969697, "grad_norm": 0.28403446078300476, "learning_rate": 2.4820000000000003e-05, "loss": 0.3486, "step": 2518 }, { "epoch": 229.0, "grad_norm": 0.5891252756118774, "learning_rate": 2.481e-05, "loss": 0.4587, "step": 2519 }, { "epoch": 229.0969696969697, "grad_norm": 0.26723888516426086, "learning_rate": 2.48e-05, "loss": 0.3911, "step": 2520 }, { "epoch": 229.0969696969697, "eval_loss": 0.43881770968437195, "eval_runtime": 2.132, "eval_samples_per_second": 25.797, "eval_steps_per_second": 3.283, "step": 2520 }, { "epoch": 229.1939393939394, "grad_norm": 0.30375170707702637, "learning_rate": 2.479e-05, "loss": 0.3853, "step": 2521 }, { "epoch": 229.29090909090908, "grad_norm": 0.2493750900030136, "learning_rate": 2.478e-05, "loss": 0.364, "step": 2522 }, { "epoch": 229.38787878787878, "grad_norm": 0.2770415246486664, "learning_rate": 2.4770000000000002e-05, "loss": 0.3524, "step": 2523 }, { "epoch": 229.4848484848485, "grad_norm": 0.3257941007614136, "learning_rate": 2.476e-05, "loss": 0.3724, "step": 2524 }, { "epoch": 229.5818181818182, "grad_norm": 0.29938164353370667, "learning_rate": 2.4750000000000002e-05, "loss": 0.3349, "step": 2525 }, { "epoch": 229.6787878787879, "grad_norm": 0.28248274326324463, "learning_rate": 2.4740000000000004e-05, "loss": 0.3504, "step": 2526 }, { "epoch": 229.77575757575758, "grad_norm": 0.27092963457107544, "learning_rate": 2.473e-05, "loss": 0.3698, "step": 2527 }, { "epoch": 229.87272727272727, "grad_norm": 0.28257593512535095, "learning_rate": 2.472e-05, "loss": 0.3456, "step": 2528 }, { "epoch": 229.96969696969697, "grad_norm": 0.2728407680988312, "learning_rate": 2.471e-05, "loss": 0.3744, "step": 2529 }, { "epoch": 230.0, "grad_norm": 0.6111608147621155, "learning_rate": 2.47e-05, "loss": 0.378, "step": 2530 }, { "epoch": 230.0, "eval_loss": 0.438978910446167, "eval_runtime": 2.1304, "eval_samples_per_second": 25.817, "eval_steps_per_second": 3.286, "step": 2530 }, { "epoch": 230.0969696969697, "grad_norm": 0.2594529688358307, "learning_rate": 2.4690000000000002e-05, "loss": 0.369, "step": 2531 }, { "epoch": 230.1939393939394, "grad_norm": 0.25597700476646423, "learning_rate": 2.468e-05, "loss": 0.344, "step": 2532 }, { "epoch": 230.29090909090908, "grad_norm": 0.307706356048584, "learning_rate": 2.4670000000000003e-05, "loss": 0.3455, "step": 2533 }, { "epoch": 230.38787878787878, "grad_norm": 0.2459164261817932, "learning_rate": 2.466e-05, "loss": 0.3559, "step": 2534 }, { "epoch": 230.4848484848485, "grad_norm": 0.3313330113887787, "learning_rate": 2.465e-05, "loss": 0.3781, "step": 2535 }, { "epoch": 230.5818181818182, "grad_norm": 0.3011106550693512, "learning_rate": 2.464e-05, "loss": 0.3497, "step": 2536 }, { "epoch": 230.6787878787879, "grad_norm": 0.3560205399990082, "learning_rate": 2.463e-05, "loss": 0.3793, "step": 2537 }, { "epoch": 230.77575757575758, "grad_norm": 0.2969333529472351, "learning_rate": 2.462e-05, "loss": 0.3979, "step": 2538 }, { "epoch": 230.87272727272727, "grad_norm": 0.32418274879455566, "learning_rate": 2.4610000000000003e-05, "loss": 0.3774, "step": 2539 }, { "epoch": 230.96969696969697, "grad_norm": 0.27839162945747375, "learning_rate": 2.46e-05, "loss": 0.3585, "step": 2540 }, { "epoch": 230.96969696969697, "eval_loss": 0.4386548399925232, "eval_runtime": 2.1313, "eval_samples_per_second": 25.806, "eval_steps_per_second": 3.284, "step": 2540 }, { "epoch": 231.0, "grad_norm": 0.6563290953636169, "learning_rate": 2.4590000000000003e-05, "loss": 0.3148, "step": 2541 }, { "epoch": 231.0969696969697, "grad_norm": 0.28322136402130127, "learning_rate": 2.4580000000000002e-05, "loss": 0.3882, "step": 2542 }, { "epoch": 231.1939393939394, "grad_norm": 0.4256789982318878, "learning_rate": 2.457e-05, "loss": 0.3609, "step": 2543 }, { "epoch": 231.29090909090908, "grad_norm": 0.30935171246528625, "learning_rate": 2.4560000000000002e-05, "loss": 0.3702, "step": 2544 }, { "epoch": 231.38787878787878, "grad_norm": 0.3351457417011261, "learning_rate": 2.455e-05, "loss": 0.3499, "step": 2545 }, { "epoch": 231.4848484848485, "grad_norm": 0.38497939705848694, "learning_rate": 2.4540000000000002e-05, "loss": 0.3414, "step": 2546 }, { "epoch": 231.5818181818182, "grad_norm": 0.33501312136650085, "learning_rate": 2.453e-05, "loss": 0.3999, "step": 2547 }, { "epoch": 231.6787878787879, "grad_norm": 0.321706086397171, "learning_rate": 2.4520000000000002e-05, "loss": 0.3548, "step": 2548 }, { "epoch": 231.77575757575758, "grad_norm": 0.2958731949329376, "learning_rate": 2.451e-05, "loss": 0.3296, "step": 2549 }, { "epoch": 231.87272727272727, "grad_norm": 0.2821996510028839, "learning_rate": 2.45e-05, "loss": 0.3653, "step": 2550 }, { "epoch": 231.87272727272727, "eval_loss": 0.43812915682792664, "eval_runtime": 2.134, "eval_samples_per_second": 25.773, "eval_steps_per_second": 3.28, "step": 2550 }, { "epoch": 231.96969696969697, "grad_norm": 0.26832205057144165, "learning_rate": 2.449e-05, "loss": 0.3869, "step": 2551 }, { "epoch": 232.0, "grad_norm": 0.442310094833374, "learning_rate": 2.448e-05, "loss": 0.3305, "step": 2552 }, { "epoch": 232.0969696969697, "grad_norm": 0.2917024791240692, "learning_rate": 2.447e-05, "loss": 0.3526, "step": 2553 }, { "epoch": 232.1939393939394, "grad_norm": 0.2630578279495239, "learning_rate": 2.4460000000000003e-05, "loss": 0.3713, "step": 2554 }, { "epoch": 232.29090909090908, "grad_norm": 0.2642268240451813, "learning_rate": 2.445e-05, "loss": 0.3782, "step": 2555 }, { "epoch": 232.38787878787878, "grad_norm": 0.33103933930397034, "learning_rate": 2.4440000000000003e-05, "loss": 0.4088, "step": 2556 }, { "epoch": 232.4848484848485, "grad_norm": 0.33370935916900635, "learning_rate": 2.443e-05, "loss": 0.352, "step": 2557 }, { "epoch": 232.5818181818182, "grad_norm": 0.31459617614746094, "learning_rate": 2.442e-05, "loss": 0.3471, "step": 2558 }, { "epoch": 232.6787878787879, "grad_norm": 0.315646231174469, "learning_rate": 2.4410000000000002e-05, "loss": 0.3721, "step": 2559 }, { "epoch": 232.77575757575758, "grad_norm": 0.27737879753112793, "learning_rate": 2.44e-05, "loss": 0.3508, "step": 2560 }, { "epoch": 232.77575757575758, "eval_loss": 0.43808242678642273, "eval_runtime": 2.1296, "eval_samples_per_second": 25.827, "eval_steps_per_second": 3.287, "step": 2560 }, { "epoch": 232.87272727272727, "grad_norm": 0.32893404364585876, "learning_rate": 2.4390000000000002e-05, "loss": 0.3614, "step": 2561 }, { "epoch": 232.96969696969697, "grad_norm": 0.28458207845687866, "learning_rate": 2.438e-05, "loss": 0.3461, "step": 2562 }, { "epoch": 233.0, "grad_norm": 0.5089003443717957, "learning_rate": 2.4370000000000002e-05, "loss": 0.3414, "step": 2563 }, { "epoch": 233.0969696969697, "grad_norm": 0.32916566729545593, "learning_rate": 2.4360000000000004e-05, "loss": 0.3614, "step": 2564 }, { "epoch": 233.1939393939394, "grad_norm": 0.30260559916496277, "learning_rate": 2.435e-05, "loss": 0.363, "step": 2565 }, { "epoch": 233.29090909090908, "grad_norm": 0.2912161350250244, "learning_rate": 2.434e-05, "loss": 0.365, "step": 2566 }, { "epoch": 233.38787878787878, "grad_norm": 0.2847774922847748, "learning_rate": 2.433e-05, "loss": 0.3574, "step": 2567 }, { "epoch": 233.4848484848485, "grad_norm": 0.3008267879486084, "learning_rate": 2.432e-05, "loss": 0.3781, "step": 2568 }, { "epoch": 233.5818181818182, "grad_norm": 0.29215264320373535, "learning_rate": 2.4310000000000003e-05, "loss": 0.3462, "step": 2569 }, { "epoch": 233.6787878787879, "grad_norm": 0.39694666862487793, "learning_rate": 2.43e-05, "loss": 0.3789, "step": 2570 }, { "epoch": 233.6787878787879, "eval_loss": 0.4377782642841339, "eval_runtime": 2.1486, "eval_samples_per_second": 25.599, "eval_steps_per_second": 3.258, "step": 2570 }, { "epoch": 233.77575757575758, "grad_norm": 0.2967980206012726, "learning_rate": 2.4290000000000003e-05, "loss": 0.3664, "step": 2571 }, { "epoch": 233.87272727272727, "grad_norm": 0.2728116512298584, "learning_rate": 2.428e-05, "loss": 0.3658, "step": 2572 }, { "epoch": 233.96969696969697, "grad_norm": 0.3128872811794281, "learning_rate": 2.427e-05, "loss": 0.3539, "step": 2573 }, { "epoch": 234.0, "grad_norm": 0.36063623428344727, "learning_rate": 2.426e-05, "loss": 0.3449, "step": 2574 }, { "epoch": 234.0969696969697, "grad_norm": 0.24417349696159363, "learning_rate": 2.425e-05, "loss": 0.3253, "step": 2575 }, { "epoch": 234.1939393939394, "grad_norm": 0.40090200304985046, "learning_rate": 2.4240000000000002e-05, "loss": 0.3953, "step": 2576 }, { "epoch": 234.29090909090908, "grad_norm": 0.2958609461784363, "learning_rate": 2.423e-05, "loss": 0.3593, "step": 2577 }, { "epoch": 234.38787878787878, "grad_norm": 0.2797829210758209, "learning_rate": 2.4220000000000002e-05, "loss": 0.3864, "step": 2578 }, { "epoch": 234.4848484848485, "grad_norm": 0.2914266288280487, "learning_rate": 2.4210000000000004e-05, "loss": 0.3434, "step": 2579 }, { "epoch": 234.5818181818182, "grad_norm": 0.27718478441238403, "learning_rate": 2.4200000000000002e-05, "loss": 0.3629, "step": 2580 }, { "epoch": 234.5818181818182, "eval_loss": 0.43738406896591187, "eval_runtime": 2.1359, "eval_samples_per_second": 25.751, "eval_steps_per_second": 3.277, "step": 2580 }, { "epoch": 234.6787878787879, "grad_norm": 0.2699889540672302, "learning_rate": 2.419e-05, "loss": 0.3434, "step": 2581 }, { "epoch": 234.77575757575758, "grad_norm": 0.2644093930721283, "learning_rate": 2.418e-05, "loss": 0.3681, "step": 2582 }, { "epoch": 234.87272727272727, "grad_norm": 0.2939375638961792, "learning_rate": 2.417e-05, "loss": 0.3763, "step": 2583 }, { "epoch": 234.96969696969697, "grad_norm": 0.2855975925922394, "learning_rate": 2.4160000000000002e-05, "loss": 0.359, "step": 2584 }, { "epoch": 235.0, "grad_norm": 0.4643716514110565, "learning_rate": 2.415e-05, "loss": 0.3744, "step": 2585 }, { "epoch": 235.0969696969697, "grad_norm": 0.34113290905952454, "learning_rate": 2.4140000000000003e-05, "loss": 0.4155, "step": 2586 }, { "epoch": 235.1939393939394, "grad_norm": 0.273038774728775, "learning_rate": 2.413e-05, "loss": 0.3692, "step": 2587 }, { "epoch": 235.29090909090908, "grad_norm": 0.3686050772666931, "learning_rate": 2.412e-05, "loss": 0.3418, "step": 2588 }, { "epoch": 235.38787878787878, "grad_norm": 0.27432775497436523, "learning_rate": 2.411e-05, "loss": 0.3567, "step": 2589 }, { "epoch": 235.4848484848485, "grad_norm": 0.27832579612731934, "learning_rate": 2.41e-05, "loss": 0.3441, "step": 2590 }, { "epoch": 235.4848484848485, "eval_loss": 0.4376099109649658, "eval_runtime": 2.1414, "eval_samples_per_second": 25.685, "eval_steps_per_second": 3.269, "step": 2590 }, { "epoch": 235.5818181818182, "grad_norm": 0.29833900928497314, "learning_rate": 2.409e-05, "loss": 0.3467, "step": 2591 }, { "epoch": 235.6787878787879, "grad_norm": 0.29584160447120667, "learning_rate": 2.408e-05, "loss": 0.377, "step": 2592 }, { "epoch": 235.77575757575758, "grad_norm": 0.3445207178592682, "learning_rate": 2.407e-05, "loss": 0.3735, "step": 2593 }, { "epoch": 235.87272727272727, "grad_norm": 0.304282009601593, "learning_rate": 2.4060000000000003e-05, "loss": 0.3469, "step": 2594 }, { "epoch": 235.96969696969697, "grad_norm": 0.24318461120128632, "learning_rate": 2.4050000000000002e-05, "loss": 0.3524, "step": 2595 }, { "epoch": 236.0, "grad_norm": 0.5731469392776489, "learning_rate": 2.404e-05, "loss": 0.3424, "step": 2596 }, { "epoch": 236.0969696969697, "grad_norm": 0.2819751501083374, "learning_rate": 2.4030000000000002e-05, "loss": 0.3142, "step": 2597 }, { "epoch": 236.1939393939394, "grad_norm": 0.2591220438480377, "learning_rate": 2.402e-05, "loss": 0.3156, "step": 2598 }, { "epoch": 236.29090909090908, "grad_norm": 0.29000306129455566, "learning_rate": 2.4010000000000002e-05, "loss": 0.3941, "step": 2599 }, { "epoch": 236.38787878787878, "grad_norm": 0.2846730947494507, "learning_rate": 2.4e-05, "loss": 0.3397, "step": 2600 }, { "epoch": 236.38787878787878, "eval_loss": 0.437418133020401, "eval_runtime": 2.123, "eval_samples_per_second": 25.907, "eval_steps_per_second": 3.297, "step": 2600 }, { "epoch": 236.4848484848485, "grad_norm": 0.32599303126335144, "learning_rate": 2.3990000000000002e-05, "loss": 0.3638, "step": 2601 }, { "epoch": 236.5818181818182, "grad_norm": 0.2827683687210083, "learning_rate": 2.398e-05, "loss": 0.3915, "step": 2602 }, { "epoch": 236.6787878787879, "grad_norm": 0.26976168155670166, "learning_rate": 2.397e-05, "loss": 0.366, "step": 2603 }, { "epoch": 236.77575757575758, "grad_norm": 0.33612337708473206, "learning_rate": 2.396e-05, "loss": 0.3472, "step": 2604 }, { "epoch": 236.87272727272727, "grad_norm": 0.24638354778289795, "learning_rate": 2.395e-05, "loss": 0.3924, "step": 2605 }, { "epoch": 236.96969696969697, "grad_norm": 0.3176722228527069, "learning_rate": 2.394e-05, "loss": 0.3878, "step": 2606 }, { "epoch": 237.0, "grad_norm": 0.49755457043647766, "learning_rate": 2.3930000000000003e-05, "loss": 0.3706, "step": 2607 }, { "epoch": 237.0969696969697, "grad_norm": 0.26535603404045105, "learning_rate": 2.392e-05, "loss": 0.3736, "step": 2608 }, { "epoch": 237.1939393939394, "grad_norm": 0.2597862184047699, "learning_rate": 2.3910000000000003e-05, "loss": 0.3434, "step": 2609 }, { "epoch": 237.29090909090908, "grad_norm": 0.30801284313201904, "learning_rate": 2.39e-05, "loss": 0.3423, "step": 2610 }, { "epoch": 237.29090909090908, "eval_loss": 0.4371388256549835, "eval_runtime": 2.1469, "eval_samples_per_second": 25.618, "eval_steps_per_second": 3.26, "step": 2610 }, { "epoch": 237.38787878787878, "grad_norm": 0.29568618535995483, "learning_rate": 2.389e-05, "loss": 0.353, "step": 2611 }, { "epoch": 237.4848484848485, "grad_norm": 0.2596912384033203, "learning_rate": 2.3880000000000002e-05, "loss": 0.394, "step": 2612 }, { "epoch": 237.5818181818182, "grad_norm": 0.2487613558769226, "learning_rate": 2.387e-05, "loss": 0.3593, "step": 2613 }, { "epoch": 237.6787878787879, "grad_norm": 0.30508145689964294, "learning_rate": 2.3860000000000002e-05, "loss": 0.3823, "step": 2614 }, { "epoch": 237.77575757575758, "grad_norm": 0.3193148970603943, "learning_rate": 2.385e-05, "loss": 0.3531, "step": 2615 }, { "epoch": 237.87272727272727, "grad_norm": 0.3358408808708191, "learning_rate": 2.3840000000000002e-05, "loss": 0.3581, "step": 2616 }, { "epoch": 237.96969696969697, "grad_norm": 0.3238699436187744, "learning_rate": 2.3830000000000004e-05, "loss": 0.3644, "step": 2617 }, { "epoch": 238.0, "grad_norm": 0.5226176381111145, "learning_rate": 2.3820000000000002e-05, "loss": 0.3302, "step": 2618 }, { "epoch": 238.0969696969697, "grad_norm": 0.2687991261482239, "learning_rate": 2.381e-05, "loss": 0.3478, "step": 2619 }, { "epoch": 238.1939393939394, "grad_norm": 0.2733170688152313, "learning_rate": 2.38e-05, "loss": 0.3572, "step": 2620 }, { "epoch": 238.1939393939394, "eval_loss": 0.4368661344051361, "eval_runtime": 2.1339, "eval_samples_per_second": 25.774, "eval_steps_per_second": 3.28, "step": 2620 }, { "epoch": 238.29090909090908, "grad_norm": 0.2491229921579361, "learning_rate": 2.379e-05, "loss": 0.3534, "step": 2621 }, { "epoch": 238.38787878787878, "grad_norm": 0.3663552701473236, "learning_rate": 2.3780000000000003e-05, "loss": 0.3496, "step": 2622 }, { "epoch": 238.4848484848485, "grad_norm": 0.28013721108436584, "learning_rate": 2.377e-05, "loss": 0.3667, "step": 2623 }, { "epoch": 238.5818181818182, "grad_norm": 0.3751683533191681, "learning_rate": 2.3760000000000003e-05, "loss": 0.3921, "step": 2624 }, { "epoch": 238.6787878787879, "grad_norm": 0.2832871675491333, "learning_rate": 2.375e-05, "loss": 0.3444, "step": 2625 }, { "epoch": 238.77575757575758, "grad_norm": 0.3930876851081848, "learning_rate": 2.374e-05, "loss": 0.3599, "step": 2626 }, { "epoch": 238.87272727272727, "grad_norm": 0.3104434013366699, "learning_rate": 2.373e-05, "loss": 0.351, "step": 2627 }, { "epoch": 238.96969696969697, "grad_norm": 0.36164671182632446, "learning_rate": 2.372e-05, "loss": 0.388, "step": 2628 }, { "epoch": 239.0, "grad_norm": 0.4879319965839386, "learning_rate": 2.371e-05, "loss": 0.362, "step": 2629 }, { "epoch": 239.0969696969697, "grad_norm": 0.32071059942245483, "learning_rate": 2.37e-05, "loss": 0.3739, "step": 2630 }, { "epoch": 239.0969696969697, "eval_loss": 0.43735435605049133, "eval_runtime": 2.1416, "eval_samples_per_second": 25.682, "eval_steps_per_second": 3.269, "step": 2630 }, { "epoch": 239.1939393939394, "grad_norm": 0.27180489897727966, "learning_rate": 2.3690000000000002e-05, "loss": 0.3268, "step": 2631 }, { "epoch": 239.29090909090908, "grad_norm": 0.44245511293411255, "learning_rate": 2.3680000000000004e-05, "loss": 0.3708, "step": 2632 }, { "epoch": 239.38787878787878, "grad_norm": 0.350221186876297, "learning_rate": 2.3670000000000002e-05, "loss": 0.3692, "step": 2633 }, { "epoch": 239.4848484848485, "grad_norm": 0.34308651089668274, "learning_rate": 2.366e-05, "loss": 0.3924, "step": 2634 }, { "epoch": 239.5818181818182, "grad_norm": 0.29386603832244873, "learning_rate": 2.365e-05, "loss": 0.3458, "step": 2635 }, { "epoch": 239.6787878787879, "grad_norm": 0.29850178956985474, "learning_rate": 2.364e-05, "loss": 0.3525, "step": 2636 }, { "epoch": 239.77575757575758, "grad_norm": 0.302652508020401, "learning_rate": 2.3630000000000002e-05, "loss": 0.3375, "step": 2637 }, { "epoch": 239.87272727272727, "grad_norm": 0.2891862392425537, "learning_rate": 2.362e-05, "loss": 0.3687, "step": 2638 }, { "epoch": 239.96969696969697, "grad_norm": 0.3072037100791931, "learning_rate": 2.3610000000000003e-05, "loss": 0.3664, "step": 2639 }, { "epoch": 240.0, "grad_norm": 0.5286477208137512, "learning_rate": 2.36e-05, "loss": 0.368, "step": 2640 }, { "epoch": 240.0, "eval_loss": 0.4361574351787567, "eval_runtime": 2.1758, "eval_samples_per_second": 25.278, "eval_steps_per_second": 3.217, "step": 2640 }, { "epoch": 240.0969696969697, "grad_norm": 0.2846533954143524, "learning_rate": 2.359e-05, "loss": 0.3299, "step": 2641 }, { "epoch": 240.1939393939394, "grad_norm": 0.282564252614975, "learning_rate": 2.358e-05, "loss": 0.3832, "step": 2642 }, { "epoch": 240.29090909090908, "grad_norm": 0.27177733182907104, "learning_rate": 2.357e-05, "loss": 0.3328, "step": 2643 }, { "epoch": 240.38787878787878, "grad_norm": 0.2859629988670349, "learning_rate": 2.356e-05, "loss": 0.3656, "step": 2644 }, { "epoch": 240.4848484848485, "grad_norm": 0.2732296884059906, "learning_rate": 2.355e-05, "loss": 0.3545, "step": 2645 }, { "epoch": 240.5818181818182, "grad_norm": 0.2838834524154663, "learning_rate": 2.354e-05, "loss": 0.331, "step": 2646 }, { "epoch": 240.6787878787879, "grad_norm": 0.3037766218185425, "learning_rate": 2.3530000000000003e-05, "loss": 0.3786, "step": 2647 }, { "epoch": 240.77575757575758, "grad_norm": 0.3013867139816284, "learning_rate": 2.3520000000000002e-05, "loss": 0.3472, "step": 2648 }, { "epoch": 240.87272727272727, "grad_norm": 0.2989252209663391, "learning_rate": 2.351e-05, "loss": 0.37, "step": 2649 }, { "epoch": 240.96969696969697, "grad_norm": 0.2740340828895569, "learning_rate": 2.35e-05, "loss": 0.3863, "step": 2650 }, { "epoch": 240.96969696969697, "eval_loss": 0.435860276222229, "eval_runtime": 2.1361, "eval_samples_per_second": 25.748, "eval_steps_per_second": 3.277, "step": 2650 }, { "epoch": 241.0, "grad_norm": 0.48094940185546875, "learning_rate": 2.349e-05, "loss": 0.4181, "step": 2651 }, { "epoch": 241.0969696969697, "grad_norm": 0.25698959827423096, "learning_rate": 2.3480000000000002e-05, "loss": 0.3279, "step": 2652 }, { "epoch": 241.1939393939394, "grad_norm": 0.27691131830215454, "learning_rate": 2.347e-05, "loss": 0.3487, "step": 2653 }, { "epoch": 241.29090909090908, "grad_norm": 0.24899674952030182, "learning_rate": 2.3460000000000002e-05, "loss": 0.3505, "step": 2654 }, { "epoch": 241.38787878787878, "grad_norm": 0.2850962281227112, "learning_rate": 2.345e-05, "loss": 0.3669, "step": 2655 }, { "epoch": 241.4848484848485, "grad_norm": 0.3749915659427643, "learning_rate": 2.344e-05, "loss": 0.3641, "step": 2656 }, { "epoch": 241.5818181818182, "grad_norm": 0.35326889157295227, "learning_rate": 2.343e-05, "loss": 0.3402, "step": 2657 }, { "epoch": 241.6787878787879, "grad_norm": 0.3130856156349182, "learning_rate": 2.342e-05, "loss": 0.3617, "step": 2658 }, { "epoch": 241.77575757575758, "grad_norm": 0.32955002784729004, "learning_rate": 2.341e-05, "loss": 0.3988, "step": 2659 }, { "epoch": 241.87272727272727, "grad_norm": 0.3192631006240845, "learning_rate": 2.3400000000000003e-05, "loss": 0.3683, "step": 2660 }, { "epoch": 241.87272727272727, "eval_loss": 0.43651309609413147, "eval_runtime": 2.1336, "eval_samples_per_second": 25.778, "eval_steps_per_second": 3.281, "step": 2660 }, { "epoch": 241.96969696969697, "grad_norm": 0.2528250515460968, "learning_rate": 2.339e-05, "loss": 0.3671, "step": 2661 }, { "epoch": 242.0, "grad_norm": 0.4293440282344818, "learning_rate": 2.3380000000000003e-05, "loss": 0.3697, "step": 2662 }, { "epoch": 242.0969696969697, "grad_norm": 0.3195413649082184, "learning_rate": 2.337e-05, "loss": 0.3752, "step": 2663 }, { "epoch": 242.1939393939394, "grad_norm": 0.25986403226852417, "learning_rate": 2.336e-05, "loss": 0.3317, "step": 2664 }, { "epoch": 242.29090909090908, "grad_norm": 0.33819544315338135, "learning_rate": 2.3350000000000002e-05, "loss": 0.3416, "step": 2665 }, { "epoch": 242.38787878787878, "grad_norm": 0.3044179379940033, "learning_rate": 2.334e-05, "loss": 0.3542, "step": 2666 }, { "epoch": 242.4848484848485, "grad_norm": 0.39956021308898926, "learning_rate": 2.3330000000000002e-05, "loss": 0.3439, "step": 2667 }, { "epoch": 242.5818181818182, "grad_norm": 0.3181312084197998, "learning_rate": 2.332e-05, "loss": 0.3304, "step": 2668 }, { "epoch": 242.6787878787879, "grad_norm": 0.2646390497684479, "learning_rate": 2.3310000000000002e-05, "loss": 0.3445, "step": 2669 }, { "epoch": 242.77575757575758, "grad_norm": 0.33343034982681274, "learning_rate": 2.3300000000000004e-05, "loss": 0.3675, "step": 2670 }, { "epoch": 242.77575757575758, "eval_loss": 0.43568772077560425, "eval_runtime": 2.176, "eval_samples_per_second": 25.275, "eval_steps_per_second": 3.217, "step": 2670 }, { "epoch": 242.87272727272727, "grad_norm": 0.300836980342865, "learning_rate": 2.3290000000000002e-05, "loss": 0.3774, "step": 2671 }, { "epoch": 242.96969696969697, "grad_norm": 0.27857717871665955, "learning_rate": 2.328e-05, "loss": 0.3975, "step": 2672 }, { "epoch": 243.0, "grad_norm": 0.605607271194458, "learning_rate": 2.327e-05, "loss": 0.4599, "step": 2673 }, { "epoch": 243.0969696969697, "grad_norm": 0.31283333897590637, "learning_rate": 2.326e-05, "loss": 0.3753, "step": 2674 }, { "epoch": 243.1939393939394, "grad_norm": 0.2685428261756897, "learning_rate": 2.3250000000000003e-05, "loss": 0.3631, "step": 2675 }, { "epoch": 243.29090909090908, "grad_norm": 0.2813699245452881, "learning_rate": 2.324e-05, "loss": 0.3413, "step": 2676 }, { "epoch": 243.38787878787878, "grad_norm": 0.26216891407966614, "learning_rate": 2.3230000000000003e-05, "loss": 0.3513, "step": 2677 }, { "epoch": 243.4848484848485, "grad_norm": 0.2780851125717163, "learning_rate": 2.322e-05, "loss": 0.3912, "step": 2678 }, { "epoch": 243.5818181818182, "grad_norm": 0.27967149019241333, "learning_rate": 2.321e-05, "loss": 0.3524, "step": 2679 }, { "epoch": 243.6787878787879, "grad_norm": 0.3546093702316284, "learning_rate": 2.32e-05, "loss": 0.3769, "step": 2680 }, { "epoch": 243.6787878787879, "eval_loss": 0.4360385835170746, "eval_runtime": 2.1281, "eval_samples_per_second": 25.844, "eval_steps_per_second": 3.289, "step": 2680 }, { "epoch": 243.77575757575758, "grad_norm": 0.3281584084033966, "learning_rate": 2.319e-05, "loss": 0.3355, "step": 2681 }, { "epoch": 243.87272727272727, "grad_norm": 0.2678728997707367, "learning_rate": 2.318e-05, "loss": 0.3333, "step": 2682 }, { "epoch": 243.96969696969697, "grad_norm": 0.33740246295928955, "learning_rate": 2.317e-05, "loss": 0.3729, "step": 2683 }, { "epoch": 244.0, "grad_norm": 0.49675431847572327, "learning_rate": 2.3160000000000002e-05, "loss": 0.3465, "step": 2684 }, { "epoch": 244.0969696969697, "grad_norm": 0.29415813088417053, "learning_rate": 2.3150000000000004e-05, "loss": 0.3735, "step": 2685 }, { "epoch": 244.1939393939394, "grad_norm": 0.2935345768928528, "learning_rate": 2.3140000000000002e-05, "loss": 0.3458, "step": 2686 }, { "epoch": 244.29090909090908, "grad_norm": 0.29703962802886963, "learning_rate": 2.313e-05, "loss": 0.3842, "step": 2687 }, { "epoch": 244.38787878787878, "grad_norm": 0.3192014694213867, "learning_rate": 2.312e-05, "loss": 0.3941, "step": 2688 }, { "epoch": 244.4848484848485, "grad_norm": 0.29394468665122986, "learning_rate": 2.311e-05, "loss": 0.3421, "step": 2689 }, { "epoch": 244.5818181818182, "grad_norm": 0.24139085412025452, "learning_rate": 2.3100000000000002e-05, "loss": 0.3456, "step": 2690 }, { "epoch": 244.5818181818182, "eval_loss": 0.4357967674732208, "eval_runtime": 2.1326, "eval_samples_per_second": 25.79, "eval_steps_per_second": 3.282, "step": 2690 }, { "epoch": 244.6787878787879, "grad_norm": 0.2768908143043518, "learning_rate": 2.309e-05, "loss": 0.3778, "step": 2691 }, { "epoch": 244.77575757575758, "grad_norm": 0.2763884663581848, "learning_rate": 2.3080000000000003e-05, "loss": 0.3425, "step": 2692 }, { "epoch": 244.87272727272727, "grad_norm": 0.310211181640625, "learning_rate": 2.307e-05, "loss": 0.3269, "step": 2693 }, { "epoch": 244.96969696969697, "grad_norm": 0.27987420558929443, "learning_rate": 2.306e-05, "loss": 0.3573, "step": 2694 }, { "epoch": 245.0, "grad_norm": 0.49357059597969055, "learning_rate": 2.305e-05, "loss": 0.3383, "step": 2695 }, { "epoch": 245.0969696969697, "grad_norm": 0.29769405722618103, "learning_rate": 2.304e-05, "loss": 0.3586, "step": 2696 }, { "epoch": 245.1939393939394, "grad_norm": 0.29475346207618713, "learning_rate": 2.303e-05, "loss": 0.3454, "step": 2697 }, { "epoch": 245.29090909090908, "grad_norm": 0.2872675359249115, "learning_rate": 2.302e-05, "loss": 0.3577, "step": 2698 }, { "epoch": 245.38787878787878, "grad_norm": 0.27988383173942566, "learning_rate": 2.301e-05, "loss": 0.3627, "step": 2699 }, { "epoch": 245.4848484848485, "grad_norm": 0.3422905206680298, "learning_rate": 2.3000000000000003e-05, "loss": 0.38, "step": 2700 }, { "epoch": 245.4848484848485, "eval_loss": 0.43530699610710144, "eval_runtime": 2.1081, "eval_samples_per_second": 26.09, "eval_steps_per_second": 3.32, "step": 2700 }, { "epoch": 245.5818181818182, "grad_norm": 0.3205651044845581, "learning_rate": 2.2990000000000002e-05, "loss": 0.3462, "step": 2701 }, { "epoch": 245.6787878787879, "grad_norm": 0.2794988751411438, "learning_rate": 2.298e-05, "loss": 0.378, "step": 2702 }, { "epoch": 245.77575757575758, "grad_norm": 0.2612413465976715, "learning_rate": 2.297e-05, "loss": 0.3647, "step": 2703 }, { "epoch": 245.87272727272727, "grad_norm": 0.2854641377925873, "learning_rate": 2.296e-05, "loss": 0.3289, "step": 2704 }, { "epoch": 245.96969696969697, "grad_norm": 0.29106345772743225, "learning_rate": 2.2950000000000002e-05, "loss": 0.349, "step": 2705 }, { "epoch": 246.0, "grad_norm": 0.5645086169242859, "learning_rate": 2.294e-05, "loss": 0.3966, "step": 2706 }, { "epoch": 246.0969696969697, "grad_norm": 0.33425596356391907, "learning_rate": 2.2930000000000002e-05, "loss": 0.351, "step": 2707 }, { "epoch": 246.1939393939394, "grad_norm": 0.2670089900493622, "learning_rate": 2.292e-05, "loss": 0.3298, "step": 2708 }, { "epoch": 246.29090909090908, "grad_norm": 0.2702706456184387, "learning_rate": 2.2910000000000003e-05, "loss": 0.3676, "step": 2709 }, { "epoch": 246.38787878787878, "grad_norm": 0.30324020981788635, "learning_rate": 2.29e-05, "loss": 0.3569, "step": 2710 }, { "epoch": 246.38787878787878, "eval_loss": 0.43549418449401855, "eval_runtime": 2.1254, "eval_samples_per_second": 25.877, "eval_steps_per_second": 3.293, "step": 2710 }, { "epoch": 246.4848484848485, "grad_norm": 0.3084642291069031, "learning_rate": 2.289e-05, "loss": 0.3725, "step": 2711 }, { "epoch": 246.5818181818182, "grad_norm": 0.28131335973739624, "learning_rate": 2.288e-05, "loss": 0.3642, "step": 2712 }, { "epoch": 246.6787878787879, "grad_norm": 0.28382378816604614, "learning_rate": 2.287e-05, "loss": 0.3469, "step": 2713 }, { "epoch": 246.77575757575758, "grad_norm": 0.3298587203025818, "learning_rate": 2.286e-05, "loss": 0.3785, "step": 2714 }, { "epoch": 246.87272727272727, "grad_norm": 0.27626752853393555, "learning_rate": 2.2850000000000003e-05, "loss": 0.3784, "step": 2715 }, { "epoch": 246.96969696969697, "grad_norm": 0.28574326634407043, "learning_rate": 2.284e-05, "loss": 0.3418, "step": 2716 }, { "epoch": 247.0, "grad_norm": 0.4947167634963989, "learning_rate": 2.283e-05, "loss": 0.3356, "step": 2717 }, { "epoch": 247.0969696969697, "grad_norm": 0.3042255640029907, "learning_rate": 2.282e-05, "loss": 0.3509, "step": 2718 }, { "epoch": 247.1939393939394, "grad_norm": 0.2821086347103119, "learning_rate": 2.281e-05, "loss": 0.3148, "step": 2719 }, { "epoch": 247.29090909090908, "grad_norm": 0.3053955137729645, "learning_rate": 2.2800000000000002e-05, "loss": 0.3743, "step": 2720 }, { "epoch": 247.29090909090908, "eval_loss": 0.43521028757095337, "eval_runtime": 2.1152, "eval_samples_per_second": 26.002, "eval_steps_per_second": 3.309, "step": 2720 }, { "epoch": 247.38787878787878, "grad_norm": 0.2838248312473297, "learning_rate": 2.279e-05, "loss": 0.3703, "step": 2721 }, { "epoch": 247.4848484848485, "grad_norm": 0.2984090745449066, "learning_rate": 2.2780000000000002e-05, "loss": 0.3624, "step": 2722 }, { "epoch": 247.5818181818182, "grad_norm": 0.25600433349609375, "learning_rate": 2.2770000000000004e-05, "loss": 0.3801, "step": 2723 }, { "epoch": 247.6787878787879, "grad_norm": 0.3806086480617523, "learning_rate": 2.2760000000000002e-05, "loss": 0.3568, "step": 2724 }, { "epoch": 247.77575757575758, "grad_norm": 0.2735169231891632, "learning_rate": 2.275e-05, "loss": 0.3429, "step": 2725 }, { "epoch": 247.87272727272727, "grad_norm": 0.2780759036540985, "learning_rate": 2.274e-05, "loss": 0.3573, "step": 2726 }, { "epoch": 247.96969696969697, "grad_norm": 0.24951213598251343, "learning_rate": 2.273e-05, "loss": 0.3584, "step": 2727 }, { "epoch": 248.0, "grad_norm": 0.4915936589241028, "learning_rate": 2.2720000000000003e-05, "loss": 0.3818, "step": 2728 }, { "epoch": 248.0969696969697, "grad_norm": 0.3129208981990814, "learning_rate": 2.271e-05, "loss": 0.4144, "step": 2729 }, { "epoch": 248.1939393939394, "grad_norm": 0.24280574917793274, "learning_rate": 2.2700000000000003e-05, "loss": 0.3706, "step": 2730 }, { "epoch": 248.1939393939394, "eval_loss": 0.43458279967308044, "eval_runtime": 2.1181, "eval_samples_per_second": 25.966, "eval_steps_per_second": 3.305, "step": 2730 }, { "epoch": 248.29090909090908, "grad_norm": 0.2954522967338562, "learning_rate": 2.269e-05, "loss": 0.3396, "step": 2731 }, { "epoch": 248.38787878787878, "grad_norm": 0.2967293858528137, "learning_rate": 2.268e-05, "loss": 0.3365, "step": 2732 }, { "epoch": 248.4848484848485, "grad_norm": 0.3011590540409088, "learning_rate": 2.267e-05, "loss": 0.3521, "step": 2733 }, { "epoch": 248.5818181818182, "grad_norm": 0.2972559928894043, "learning_rate": 2.266e-05, "loss": 0.3605, "step": 2734 }, { "epoch": 248.6787878787879, "grad_norm": 0.30652526021003723, "learning_rate": 2.265e-05, "loss": 0.334, "step": 2735 }, { "epoch": 248.77575757575758, "grad_norm": 0.2733524739742279, "learning_rate": 2.264e-05, "loss": 0.3715, "step": 2736 }, { "epoch": 248.87272727272727, "grad_norm": 0.29794397950172424, "learning_rate": 2.2630000000000002e-05, "loss": 0.3531, "step": 2737 }, { "epoch": 248.96969696969697, "grad_norm": 0.26180553436279297, "learning_rate": 2.2620000000000004e-05, "loss": 0.3529, "step": 2738 }, { "epoch": 249.0, "grad_norm": 0.3981349468231201, "learning_rate": 2.2610000000000002e-05, "loss": 0.3109, "step": 2739 }, { "epoch": 249.0969696969697, "grad_norm": 0.30066075921058655, "learning_rate": 2.26e-05, "loss": 0.3637, "step": 2740 }, { "epoch": 249.0969696969697, "eval_loss": 0.4350603520870209, "eval_runtime": 2.1392, "eval_samples_per_second": 25.711, "eval_steps_per_second": 3.272, "step": 2740 }, { "epoch": 249.1939393939394, "grad_norm": 0.2947355806827545, "learning_rate": 2.259e-05, "loss": 0.4017, "step": 2741 }, { "epoch": 249.29090909090908, "grad_norm": 0.26995307207107544, "learning_rate": 2.258e-05, "loss": 0.3106, "step": 2742 }, { "epoch": 249.38787878787878, "grad_norm": 0.24598020315170288, "learning_rate": 2.2570000000000002e-05, "loss": 0.3411, "step": 2743 }, { "epoch": 249.4848484848485, "grad_norm": 0.28865739703178406, "learning_rate": 2.256e-05, "loss": 0.3884, "step": 2744 }, { "epoch": 249.5818181818182, "grad_norm": 0.3034263253211975, "learning_rate": 2.2550000000000003e-05, "loss": 0.3637, "step": 2745 }, { "epoch": 249.6787878787879, "grad_norm": 0.30831438302993774, "learning_rate": 2.254e-05, "loss": 0.3672, "step": 2746 }, { "epoch": 249.77575757575758, "grad_norm": 0.2989354133605957, "learning_rate": 2.253e-05, "loss": 0.3489, "step": 2747 }, { "epoch": 249.87272727272727, "grad_norm": 0.2784823775291443, "learning_rate": 2.252e-05, "loss": 0.3226, "step": 2748 }, { "epoch": 249.96969696969697, "grad_norm": 0.33569571375846863, "learning_rate": 2.251e-05, "loss": 0.3456, "step": 2749 }, { "epoch": 250.0, "grad_norm": 0.5049756765365601, "learning_rate": 2.25e-05, "loss": 0.3994, "step": 2750 }, { "epoch": 250.0, "eval_loss": 0.4343967139720917, "eval_runtime": 2.123, "eval_samples_per_second": 25.906, "eval_steps_per_second": 3.297, "step": 2750 }, { "epoch": 250.0969696969697, "grad_norm": 0.3116196393966675, "learning_rate": 2.249e-05, "loss": 0.3805, "step": 2751 }, { "epoch": 250.1939393939394, "grad_norm": 0.31933361291885376, "learning_rate": 2.248e-05, "loss": 0.3548, "step": 2752 }, { "epoch": 250.29090909090908, "grad_norm": 0.3103712201118469, "learning_rate": 2.2470000000000003e-05, "loss": 0.3475, "step": 2753 }, { "epoch": 250.38787878787878, "grad_norm": 0.30291813611984253, "learning_rate": 2.2460000000000002e-05, "loss": 0.3643, "step": 2754 }, { "epoch": 250.4848484848485, "grad_norm": 0.31556010246276855, "learning_rate": 2.245e-05, "loss": 0.364, "step": 2755 }, { "epoch": 250.5818181818182, "grad_norm": 0.25011295080184937, "learning_rate": 2.244e-05, "loss": 0.3281, "step": 2756 }, { "epoch": 250.6787878787879, "grad_norm": 0.30427756905555725, "learning_rate": 2.243e-05, "loss": 0.3667, "step": 2757 }, { "epoch": 250.77575757575758, "grad_norm": 0.28063085675239563, "learning_rate": 2.2420000000000002e-05, "loss": 0.386, "step": 2758 }, { "epoch": 250.87272727272727, "grad_norm": 0.3346632719039917, "learning_rate": 2.241e-05, "loss": 0.2989, "step": 2759 }, { "epoch": 250.96969696969697, "grad_norm": 0.2454753965139389, "learning_rate": 2.2400000000000002e-05, "loss": 0.3819, "step": 2760 }, { "epoch": 250.96969696969697, "eval_loss": 0.43441030383110046, "eval_runtime": 2.1286, "eval_samples_per_second": 25.839, "eval_steps_per_second": 3.289, "step": 2760 }, { "epoch": 251.0, "grad_norm": 0.3684418499469757, "learning_rate": 2.239e-05, "loss": 0.335, "step": 2761 }, { "epoch": 251.0969696969697, "grad_norm": 0.2791552245616913, "learning_rate": 2.2380000000000003e-05, "loss": 0.3631, "step": 2762 }, { "epoch": 251.1939393939394, "grad_norm": 0.3262360394001007, "learning_rate": 2.237e-05, "loss": 0.3232, "step": 2763 }, { "epoch": 251.29090909090908, "grad_norm": 0.3078155815601349, "learning_rate": 2.236e-05, "loss": 0.3687, "step": 2764 }, { "epoch": 251.38787878787878, "grad_norm": 0.2574249804019928, "learning_rate": 2.235e-05, "loss": 0.3558, "step": 2765 }, { "epoch": 251.4848484848485, "grad_norm": 0.2514949440956116, "learning_rate": 2.234e-05, "loss": 0.3739, "step": 2766 }, { "epoch": 251.5818181818182, "grad_norm": 0.3091491460800171, "learning_rate": 2.233e-05, "loss": 0.3153, "step": 2767 }, { "epoch": 251.6787878787879, "grad_norm": 0.3235987722873688, "learning_rate": 2.2320000000000003e-05, "loss": 0.3392, "step": 2768 }, { "epoch": 251.77575757575758, "grad_norm": 0.29184114933013916, "learning_rate": 2.231e-05, "loss": 0.3874, "step": 2769 }, { "epoch": 251.87272727272727, "grad_norm": 0.28661590814590454, "learning_rate": 2.23e-05, "loss": 0.3721, "step": 2770 }, { "epoch": 251.87272727272727, "eval_loss": 0.43438997864723206, "eval_runtime": 2.1219, "eval_samples_per_second": 25.92, "eval_steps_per_second": 3.299, "step": 2770 }, { "epoch": 251.96969696969697, "grad_norm": 0.29083678126335144, "learning_rate": 2.229e-05, "loss": 0.348, "step": 2771 }, { "epoch": 252.0, "grad_norm": 0.48070859909057617, "learning_rate": 2.228e-05, "loss": 0.4007, "step": 2772 }, { "epoch": 252.0969696969697, "grad_norm": 0.2264174222946167, "learning_rate": 2.2270000000000002e-05, "loss": 0.3252, "step": 2773 }, { "epoch": 252.1939393939394, "grad_norm": 0.2599073648452759, "learning_rate": 2.226e-05, "loss": 0.3747, "step": 2774 }, { "epoch": 252.29090909090908, "grad_norm": 0.3252253830432892, "learning_rate": 2.2250000000000002e-05, "loss": 0.3797, "step": 2775 }, { "epoch": 252.38787878787878, "grad_norm": 0.33481475710868835, "learning_rate": 2.224e-05, "loss": 0.3327, "step": 2776 }, { "epoch": 252.4848484848485, "grad_norm": 0.2517988085746765, "learning_rate": 2.2230000000000002e-05, "loss": 0.3472, "step": 2777 }, { "epoch": 252.5818181818182, "grad_norm": 0.28708556294441223, "learning_rate": 2.222e-05, "loss": 0.3387, "step": 2778 }, { "epoch": 252.6787878787879, "grad_norm": 0.31490185856819153, "learning_rate": 2.221e-05, "loss": 0.3629, "step": 2779 }, { "epoch": 252.77575757575758, "grad_norm": 0.34092140197753906, "learning_rate": 2.22e-05, "loss": 0.3281, "step": 2780 }, { "epoch": 252.77575757575758, "eval_loss": 0.4341490864753723, "eval_runtime": 2.132, "eval_samples_per_second": 25.797, "eval_steps_per_second": 3.283, "step": 2780 }, { "epoch": 252.87272727272727, "grad_norm": 0.31832852959632874, "learning_rate": 2.219e-05, "loss": 0.3817, "step": 2781 }, { "epoch": 252.96969696969697, "grad_norm": 0.28311994671821594, "learning_rate": 2.218e-05, "loss": 0.3705, "step": 2782 }, { "epoch": 253.0, "grad_norm": 0.4966646432876587, "learning_rate": 2.2170000000000003e-05, "loss": 0.4161, "step": 2783 }, { "epoch": 253.0969696969697, "grad_norm": 0.2603364586830139, "learning_rate": 2.216e-05, "loss": 0.3614, "step": 2784 }, { "epoch": 253.1939393939394, "grad_norm": 0.2830662429332733, "learning_rate": 2.215e-05, "loss": 0.3586, "step": 2785 }, { "epoch": 253.29090909090908, "grad_norm": 0.32937613129615784, "learning_rate": 2.214e-05, "loss": 0.3782, "step": 2786 }, { "epoch": 253.38787878787878, "grad_norm": 0.33842575550079346, "learning_rate": 2.213e-05, "loss": 0.368, "step": 2787 }, { "epoch": 253.4848484848485, "grad_norm": 0.3619191348552704, "learning_rate": 2.212e-05, "loss": 0.357, "step": 2788 }, { "epoch": 253.5818181818182, "grad_norm": 0.27501344680786133, "learning_rate": 2.211e-05, "loss": 0.3425, "step": 2789 }, { "epoch": 253.6787878787879, "grad_norm": 0.24685457348823547, "learning_rate": 2.2100000000000002e-05, "loss": 0.3064, "step": 2790 }, { "epoch": 253.6787878787879, "eval_loss": 0.4338873028755188, "eval_runtime": 2.1219, "eval_samples_per_second": 25.921, "eval_steps_per_second": 3.299, "step": 2790 }, { "epoch": 253.77575757575758, "grad_norm": 0.3157132565975189, "learning_rate": 2.2090000000000004e-05, "loss": 0.339, "step": 2791 }, { "epoch": 253.87272727272727, "grad_norm": 0.3351724445819855, "learning_rate": 2.2080000000000002e-05, "loss": 0.3762, "step": 2792 }, { "epoch": 253.96969696969697, "grad_norm": 0.29916393756866455, "learning_rate": 2.207e-05, "loss": 0.3566, "step": 2793 }, { "epoch": 254.0, "grad_norm": 0.4145132899284363, "learning_rate": 2.206e-05, "loss": 0.3867, "step": 2794 }, { "epoch": 254.0969696969697, "grad_norm": 0.3179309368133545, "learning_rate": 2.205e-05, "loss": 0.3995, "step": 2795 }, { "epoch": 254.1939393939394, "grad_norm": 0.30933842062950134, "learning_rate": 2.2040000000000002e-05, "loss": 0.3161, "step": 2796 }, { "epoch": 254.29090909090908, "grad_norm": 0.24094711244106293, "learning_rate": 2.203e-05, "loss": 0.3451, "step": 2797 }, { "epoch": 254.38787878787878, "grad_norm": 0.34217455983161926, "learning_rate": 2.2020000000000003e-05, "loss": 0.3428, "step": 2798 }, { "epoch": 254.4848484848485, "grad_norm": 0.3295113146305084, "learning_rate": 2.201e-05, "loss": 0.3441, "step": 2799 }, { "epoch": 254.5818181818182, "grad_norm": 0.293735146522522, "learning_rate": 2.2000000000000003e-05, "loss": 0.3684, "step": 2800 }, { "epoch": 254.5818181818182, "eval_loss": 0.4341706931591034, "eval_runtime": 2.1323, "eval_samples_per_second": 25.794, "eval_steps_per_second": 3.283, "step": 2800 }, { "epoch": 254.6787878787879, "grad_norm": 0.3140919804573059, "learning_rate": 2.199e-05, "loss": 0.3704, "step": 2801 }, { "epoch": 254.77575757575758, "grad_norm": 0.3150195777416229, "learning_rate": 2.198e-05, "loss": 0.3247, "step": 2802 }, { "epoch": 254.87272727272727, "grad_norm": 0.36488816142082214, "learning_rate": 2.197e-05, "loss": 0.347, "step": 2803 }, { "epoch": 254.96969696969697, "grad_norm": 0.2660673260688782, "learning_rate": 2.196e-05, "loss": 0.3861, "step": 2804 }, { "epoch": 255.0, "grad_norm": 0.43914130330085754, "learning_rate": 2.195e-05, "loss": 0.3764, "step": 2805 }, { "epoch": 255.0969696969697, "grad_norm": 0.2545498311519623, "learning_rate": 2.1940000000000003e-05, "loss": 0.3369, "step": 2806 }, { "epoch": 255.1939393939394, "grad_norm": 0.30860552191734314, "learning_rate": 2.1930000000000002e-05, "loss": 0.3233, "step": 2807 }, { "epoch": 255.29090909090908, "grad_norm": 0.2711668610572815, "learning_rate": 2.192e-05, "loss": 0.3579, "step": 2808 }, { "epoch": 255.38787878787878, "grad_norm": 0.2572686970233917, "learning_rate": 2.191e-05, "loss": 0.365, "step": 2809 }, { "epoch": 255.4848484848485, "grad_norm": 0.2472272664308548, "learning_rate": 2.19e-05, "loss": 0.3652, "step": 2810 }, { "epoch": 255.4848484848485, "eval_loss": 0.43348821997642517, "eval_runtime": 2.127, "eval_samples_per_second": 25.858, "eval_steps_per_second": 3.291, "step": 2810 }, { "epoch": 255.5818181818182, "grad_norm": 0.31041258573532104, "learning_rate": 2.1890000000000002e-05, "loss": 0.3725, "step": 2811 }, { "epoch": 255.6787878787879, "grad_norm": 0.3136427700519562, "learning_rate": 2.188e-05, "loss": 0.3256, "step": 2812 }, { "epoch": 255.77575757575758, "grad_norm": 0.3329264521598816, "learning_rate": 2.1870000000000002e-05, "loss": 0.3629, "step": 2813 }, { "epoch": 255.87272727272727, "grad_norm": 0.43352991342544556, "learning_rate": 2.186e-05, "loss": 0.3633, "step": 2814 }, { "epoch": 255.96969696969697, "grad_norm": 0.29008254408836365, "learning_rate": 2.1850000000000003e-05, "loss": 0.3756, "step": 2815 }, { "epoch": 256.0, "grad_norm": 0.5636740326881409, "learning_rate": 2.184e-05, "loss": 0.3529, "step": 2816 }, { "epoch": 256.0969696969697, "grad_norm": 0.28044673800468445, "learning_rate": 2.183e-05, "loss": 0.3452, "step": 2817 }, { "epoch": 256.1939393939394, "grad_norm": 0.3173394501209259, "learning_rate": 2.182e-05, "loss": 0.3842, "step": 2818 }, { "epoch": 256.2909090909091, "grad_norm": 0.25740739703178406, "learning_rate": 2.181e-05, "loss": 0.3507, "step": 2819 }, { "epoch": 256.3878787878788, "grad_norm": 0.24135111272335052, "learning_rate": 2.18e-05, "loss": 0.3481, "step": 2820 }, { "epoch": 256.3878787878788, "eval_loss": 0.43362826108932495, "eval_runtime": 2.1052, "eval_samples_per_second": 26.126, "eval_steps_per_second": 3.325, "step": 2820 }, { "epoch": 256.4848484848485, "grad_norm": 0.36606621742248535, "learning_rate": 2.1790000000000003e-05, "loss": 0.3631, "step": 2821 }, { "epoch": 256.58181818181816, "grad_norm": 0.34213078022003174, "learning_rate": 2.178e-05, "loss": 0.3221, "step": 2822 }, { "epoch": 256.6787878787879, "grad_norm": 0.36660224199295044, "learning_rate": 2.177e-05, "loss": 0.3657, "step": 2823 }, { "epoch": 256.77575757575755, "grad_norm": 0.2867613136768341, "learning_rate": 2.176e-05, "loss": 0.3337, "step": 2824 }, { "epoch": 256.8727272727273, "grad_norm": 0.28834688663482666, "learning_rate": 2.175e-05, "loss": 0.3719, "step": 2825 }, { "epoch": 256.969696969697, "grad_norm": 0.28708332777023315, "learning_rate": 2.1740000000000002e-05, "loss": 0.371, "step": 2826 }, { "epoch": 257.0, "grad_norm": 0.5373139381408691, "learning_rate": 2.173e-05, "loss": 0.3228, "step": 2827 }, { "epoch": 257.0969696969697, "grad_norm": 0.33356258273124695, "learning_rate": 2.1720000000000002e-05, "loss": 0.3629, "step": 2828 }, { "epoch": 257.1939393939394, "grad_norm": 0.2881753444671631, "learning_rate": 2.171e-05, "loss": 0.3258, "step": 2829 }, { "epoch": 257.2909090909091, "grad_norm": 0.27659082412719727, "learning_rate": 2.1700000000000002e-05, "loss": 0.338, "step": 2830 }, { "epoch": 257.2909090909091, "eval_loss": 0.43408533930778503, "eval_runtime": 2.1289, "eval_samples_per_second": 25.835, "eval_steps_per_second": 3.288, "step": 2830 }, { "epoch": 257.3878787878788, "grad_norm": 0.39573410153388977, "learning_rate": 2.169e-05, "loss": 0.3303, "step": 2831 }, { "epoch": 257.4848484848485, "grad_norm": 0.3405710458755493, "learning_rate": 2.168e-05, "loss": 0.3754, "step": 2832 }, { "epoch": 257.58181818181816, "grad_norm": 0.24880316853523254, "learning_rate": 2.167e-05, "loss": 0.3733, "step": 2833 }, { "epoch": 257.6787878787879, "grad_norm": 0.2995625436306, "learning_rate": 2.166e-05, "loss": 0.3231, "step": 2834 }, { "epoch": 257.77575757575755, "grad_norm": 0.3127087354660034, "learning_rate": 2.165e-05, "loss": 0.3844, "step": 2835 }, { "epoch": 257.8727272727273, "grad_norm": 0.32488930225372314, "learning_rate": 2.1640000000000003e-05, "loss": 0.3644, "step": 2836 }, { "epoch": 257.969696969697, "grad_norm": 0.28634101152420044, "learning_rate": 2.163e-05, "loss": 0.3553, "step": 2837 }, { "epoch": 258.0, "grad_norm": 0.4247503876686096, "learning_rate": 2.162e-05, "loss": 0.3823, "step": 2838 }, { "epoch": 258.0969696969697, "grad_norm": 0.29246553778648376, "learning_rate": 2.1609999999999998e-05, "loss": 0.3658, "step": 2839 }, { "epoch": 258.1939393939394, "grad_norm": 0.32681018114089966, "learning_rate": 2.16e-05, "loss": 0.3263, "step": 2840 }, { "epoch": 258.1939393939394, "eval_loss": 0.43289804458618164, "eval_runtime": 2.1171, "eval_samples_per_second": 25.978, "eval_steps_per_second": 3.306, "step": 2840 }, { "epoch": 258.2909090909091, "grad_norm": 0.4305499196052551, "learning_rate": 2.159e-05, "loss": 0.3872, "step": 2841 }, { "epoch": 258.3878787878788, "grad_norm": 0.2609012722969055, "learning_rate": 2.158e-05, "loss": 0.333, "step": 2842 }, { "epoch": 258.4848484848485, "grad_norm": 0.3244587779045105, "learning_rate": 2.1570000000000002e-05, "loss": 0.3555, "step": 2843 }, { "epoch": 258.58181818181816, "grad_norm": 0.26547229290008545, "learning_rate": 2.1560000000000004e-05, "loss": 0.3297, "step": 2844 }, { "epoch": 258.6787878787879, "grad_norm": 0.2793107330799103, "learning_rate": 2.1550000000000002e-05, "loss": 0.3694, "step": 2845 }, { "epoch": 258.77575757575755, "grad_norm": 0.3379392921924591, "learning_rate": 2.154e-05, "loss": 0.3293, "step": 2846 }, { "epoch": 258.8727272727273, "grad_norm": 0.2868521809577942, "learning_rate": 2.153e-05, "loss": 0.3364, "step": 2847 }, { "epoch": 258.969696969697, "grad_norm": 0.3567075729370117, "learning_rate": 2.152e-05, "loss": 0.3914, "step": 2848 }, { "epoch": 259.0, "grad_norm": 0.47458121180534363, "learning_rate": 2.1510000000000002e-05, "loss": 0.4021, "step": 2849 }, { "epoch": 259.0969696969697, "grad_norm": 0.39531266689300537, "learning_rate": 2.15e-05, "loss": 0.3842, "step": 2850 }, { "epoch": 259.0969696969697, "eval_loss": 0.43324747681617737, "eval_runtime": 2.1122, "eval_samples_per_second": 26.039, "eval_steps_per_second": 3.314, "step": 2850 }, { "epoch": 259.1939393939394, "grad_norm": 0.2564777135848999, "learning_rate": 2.1490000000000003e-05, "loss": 0.3599, "step": 2851 }, { "epoch": 259.2909090909091, "grad_norm": 0.27107998728752136, "learning_rate": 2.148e-05, "loss": 0.3548, "step": 2852 }, { "epoch": 259.3878787878788, "grad_norm": 0.3113803267478943, "learning_rate": 2.1470000000000003e-05, "loss": 0.3208, "step": 2853 }, { "epoch": 259.4848484848485, "grad_norm": 0.28532272577285767, "learning_rate": 2.146e-05, "loss": 0.3793, "step": 2854 }, { "epoch": 259.58181818181816, "grad_norm": 0.30134227871894836, "learning_rate": 2.145e-05, "loss": 0.357, "step": 2855 }, { "epoch": 259.6787878787879, "grad_norm": 0.3014954626560211, "learning_rate": 2.144e-05, "loss": 0.3464, "step": 2856 }, { "epoch": 259.77575757575755, "grad_norm": 0.30127498507499695, "learning_rate": 2.143e-05, "loss": 0.2996, "step": 2857 }, { "epoch": 259.8727272727273, "grad_norm": 0.30472543835639954, "learning_rate": 2.142e-05, "loss": 0.3568, "step": 2858 }, { "epoch": 259.969696969697, "grad_norm": 0.2586653530597687, "learning_rate": 2.1410000000000003e-05, "loss": 0.3592, "step": 2859 }, { "epoch": 260.0, "grad_norm": 0.5651054978370667, "learning_rate": 2.1400000000000002e-05, "loss": 0.4092, "step": 2860 }, { "epoch": 260.0, "eval_loss": 0.43332797288894653, "eval_runtime": 2.1223, "eval_samples_per_second": 25.916, "eval_steps_per_second": 3.298, "step": 2860 }, { "epoch": 260.0969696969697, "grad_norm": 0.30095285177230835, "learning_rate": 2.139e-05, "loss": 0.3493, "step": 2861 }, { "epoch": 260.1939393939394, "grad_norm": 0.26430273056030273, "learning_rate": 2.138e-05, "loss": 0.3582, "step": 2862 }, { "epoch": 260.2909090909091, "grad_norm": 0.2739563584327698, "learning_rate": 2.137e-05, "loss": 0.3702, "step": 2863 }, { "epoch": 260.3878787878788, "grad_norm": 0.36289191246032715, "learning_rate": 2.1360000000000002e-05, "loss": 0.359, "step": 2864 }, { "epoch": 260.4848484848485, "grad_norm": 0.3343953490257263, "learning_rate": 2.135e-05, "loss": 0.3568, "step": 2865 }, { "epoch": 260.58181818181816, "grad_norm": 0.29125627875328064, "learning_rate": 2.1340000000000002e-05, "loss": 0.3117, "step": 2866 }, { "epoch": 260.6787878787879, "grad_norm": 0.4203968346118927, "learning_rate": 2.133e-05, "loss": 0.3383, "step": 2867 }, { "epoch": 260.77575757575755, "grad_norm": 0.26528501510620117, "learning_rate": 2.1320000000000003e-05, "loss": 0.3634, "step": 2868 }, { "epoch": 260.8727272727273, "grad_norm": 0.2988184988498688, "learning_rate": 2.131e-05, "loss": 0.3406, "step": 2869 }, { "epoch": 260.969696969697, "grad_norm": 0.45513859391212463, "learning_rate": 2.13e-05, "loss": 0.3663, "step": 2870 }, { "epoch": 260.969696969697, "eval_loss": 0.43273502588272095, "eval_runtime": 2.1264, "eval_samples_per_second": 25.866, "eval_steps_per_second": 3.292, "step": 2870 }, { "epoch": 261.0, "grad_norm": 0.5738509893417358, "learning_rate": 2.129e-05, "loss": 0.4098, "step": 2871 }, { "epoch": 261.0969696969697, "grad_norm": 0.32520735263824463, "learning_rate": 2.128e-05, "loss": 0.3516, "step": 2872 }, { "epoch": 261.1939393939394, "grad_norm": 0.2425633817911148, "learning_rate": 2.127e-05, "loss": 0.3491, "step": 2873 }, { "epoch": 261.2909090909091, "grad_norm": 0.27022457122802734, "learning_rate": 2.1260000000000003e-05, "loss": 0.4148, "step": 2874 }, { "epoch": 261.3878787878788, "grad_norm": 0.31663596630096436, "learning_rate": 2.125e-05, "loss": 0.3379, "step": 2875 }, { "epoch": 261.4848484848485, "grad_norm": 0.3284948468208313, "learning_rate": 2.124e-05, "loss": 0.3272, "step": 2876 }, { "epoch": 261.58181818181816, "grad_norm": 0.32757529616355896, "learning_rate": 2.123e-05, "loss": 0.3585, "step": 2877 }, { "epoch": 261.6787878787879, "grad_norm": 0.2892166078090668, "learning_rate": 2.122e-05, "loss": 0.3985, "step": 2878 }, { "epoch": 261.77575757575755, "grad_norm": 0.29307177662849426, "learning_rate": 2.1210000000000002e-05, "loss": 0.3348, "step": 2879 }, { "epoch": 261.8727272727273, "grad_norm": 0.34251949191093445, "learning_rate": 2.12e-05, "loss": 0.3241, "step": 2880 }, { "epoch": 261.8727272727273, "eval_loss": 0.4326820373535156, "eval_runtime": 2.1437, "eval_samples_per_second": 25.657, "eval_steps_per_second": 3.265, "step": 2880 }, { "epoch": 261.969696969697, "grad_norm": 0.28630900382995605, "learning_rate": 2.1190000000000002e-05, "loss": 0.3342, "step": 2881 }, { "epoch": 262.0, "grad_norm": 0.47029659152030945, "learning_rate": 2.118e-05, "loss": 0.3435, "step": 2882 }, { "epoch": 262.0969696969697, "grad_norm": 0.2874227464199066, "learning_rate": 2.1170000000000002e-05, "loss": 0.365, "step": 2883 }, { "epoch": 262.1939393939394, "grad_norm": 0.2977810502052307, "learning_rate": 2.116e-05, "loss": 0.3589, "step": 2884 }, { "epoch": 262.2909090909091, "grad_norm": 0.2793772518634796, "learning_rate": 2.115e-05, "loss": 0.38, "step": 2885 }, { "epoch": 262.3878787878788, "grad_norm": 0.2945645749568939, "learning_rate": 2.114e-05, "loss": 0.3388, "step": 2886 }, { "epoch": 262.4848484848485, "grad_norm": 0.2820453643798828, "learning_rate": 2.113e-05, "loss": 0.3481, "step": 2887 }, { "epoch": 262.58181818181816, "grad_norm": 0.30547210574150085, "learning_rate": 2.112e-05, "loss": 0.3654, "step": 2888 }, { "epoch": 262.6787878787879, "grad_norm": 0.32220014929771423, "learning_rate": 2.1110000000000003e-05, "loss": 0.347, "step": 2889 }, { "epoch": 262.77575757575755, "grad_norm": 0.28820616006851196, "learning_rate": 2.11e-05, "loss": 0.368, "step": 2890 }, { "epoch": 262.77575757575755, "eval_loss": 0.4324910044670105, "eval_runtime": 2.1073, "eval_samples_per_second": 26.1, "eval_steps_per_second": 3.322, "step": 2890 }, { "epoch": 262.8727272727273, "grad_norm": 0.3436179459095001, "learning_rate": 2.1090000000000003e-05, "loss": 0.3484, "step": 2891 }, { "epoch": 262.969696969697, "grad_norm": 0.28017234802246094, "learning_rate": 2.1079999999999998e-05, "loss": 0.3152, "step": 2892 }, { "epoch": 263.0, "grad_norm": 0.4805695116519928, "learning_rate": 2.107e-05, "loss": 0.3203, "step": 2893 }, { "epoch": 263.0969696969697, "grad_norm": 0.2792573869228363, "learning_rate": 2.106e-05, "loss": 0.3718, "step": 2894 }, { "epoch": 263.1939393939394, "grad_norm": 0.27789148688316345, "learning_rate": 2.105e-05, "loss": 0.3342, "step": 2895 }, { "epoch": 263.2909090909091, "grad_norm": 0.29735344648361206, "learning_rate": 2.1040000000000002e-05, "loss": 0.3603, "step": 2896 }, { "epoch": 263.3878787878788, "grad_norm": 0.28099167346954346, "learning_rate": 2.103e-05, "loss": 0.3775, "step": 2897 }, { "epoch": 263.4848484848485, "grad_norm": 0.2803812325000763, "learning_rate": 2.1020000000000002e-05, "loss": 0.3377, "step": 2898 }, { "epoch": 263.58181818181816, "grad_norm": 0.37615877389907837, "learning_rate": 2.101e-05, "loss": 0.3467, "step": 2899 }, { "epoch": 263.6787878787879, "grad_norm": 0.27375122904777527, "learning_rate": 2.1e-05, "loss": 0.3341, "step": 2900 }, { "epoch": 263.6787878787879, "eval_loss": 0.4324527084827423, "eval_runtime": 2.1016, "eval_samples_per_second": 26.17, "eval_steps_per_second": 3.331, "step": 2900 }, { "epoch": 263.77575757575755, "grad_norm": 0.30880647897720337, "learning_rate": 2.099e-05, "loss": 0.3396, "step": 2901 }, { "epoch": 263.8727272727273, "grad_norm": 0.32526326179504395, "learning_rate": 2.098e-05, "loss": 0.3562, "step": 2902 }, { "epoch": 263.969696969697, "grad_norm": 0.2949479818344116, "learning_rate": 2.097e-05, "loss": 0.3509, "step": 2903 }, { "epoch": 264.0, "grad_norm": 0.6180709600448608, "learning_rate": 2.0960000000000003e-05, "loss": 0.3928, "step": 2904 }, { "epoch": 264.0969696969697, "grad_norm": 0.2905566990375519, "learning_rate": 2.095e-05, "loss": 0.3352, "step": 2905 }, { "epoch": 264.1939393939394, "grad_norm": 0.28563404083251953, "learning_rate": 2.0940000000000003e-05, "loss": 0.3304, "step": 2906 }, { "epoch": 264.2909090909091, "grad_norm": 0.29980409145355225, "learning_rate": 2.093e-05, "loss": 0.408, "step": 2907 }, { "epoch": 264.3878787878788, "grad_norm": 0.2590050995349884, "learning_rate": 2.092e-05, "loss": 0.345, "step": 2908 }, { "epoch": 264.4848484848485, "grad_norm": 0.321993887424469, "learning_rate": 2.091e-05, "loss": 0.328, "step": 2909 }, { "epoch": 264.58181818181816, "grad_norm": 0.23598870635032654, "learning_rate": 2.09e-05, "loss": 0.3337, "step": 2910 }, { "epoch": 264.58181818181816, "eval_loss": 0.4328293204307556, "eval_runtime": 2.0996, "eval_samples_per_second": 26.195, "eval_steps_per_second": 3.334, "step": 2910 }, { "epoch": 264.6787878787879, "grad_norm": 0.304935485124588, "learning_rate": 2.089e-05, "loss": 0.3495, "step": 2911 }, { "epoch": 264.77575757575755, "grad_norm": 0.2994263768196106, "learning_rate": 2.0880000000000003e-05, "loss": 0.3487, "step": 2912 }, { "epoch": 264.8727272727273, "grad_norm": 0.29449424147605896, "learning_rate": 2.0870000000000002e-05, "loss": 0.3425, "step": 2913 }, { "epoch": 264.969696969697, "grad_norm": 0.3142696022987366, "learning_rate": 2.086e-05, "loss": 0.4075, "step": 2914 }, { "epoch": 265.0, "grad_norm": 0.40163326263427734, "learning_rate": 2.085e-05, "loss": 0.3102, "step": 2915 }, { "epoch": 265.0969696969697, "grad_norm": 0.30728578567504883, "learning_rate": 2.084e-05, "loss": 0.3383, "step": 2916 }, { "epoch": 265.1939393939394, "grad_norm": 0.3148532211780548, "learning_rate": 2.0830000000000002e-05, "loss": 0.3846, "step": 2917 }, { "epoch": 265.2909090909091, "grad_norm": 0.27514949440956116, "learning_rate": 2.082e-05, "loss": 0.3516, "step": 2918 }, { "epoch": 265.3878787878788, "grad_norm": 0.2847309112548828, "learning_rate": 2.0810000000000002e-05, "loss": 0.2964, "step": 2919 }, { "epoch": 265.4848484848485, "grad_norm": 0.2686462104320526, "learning_rate": 2.08e-05, "loss": 0.3298, "step": 2920 }, { "epoch": 265.4848484848485, "eval_loss": 0.43240609765052795, "eval_runtime": 2.1075, "eval_samples_per_second": 26.097, "eval_steps_per_second": 3.321, "step": 2920 }, { "epoch": 265.58181818181816, "grad_norm": 0.2931896150112152, "learning_rate": 2.0790000000000003e-05, "loss": 0.3495, "step": 2921 }, { "epoch": 265.6787878787879, "grad_norm": 0.34141311049461365, "learning_rate": 2.078e-05, "loss": 0.3247, "step": 2922 }, { "epoch": 265.77575757575755, "grad_norm": 0.3040899336338043, "learning_rate": 2.077e-05, "loss": 0.38, "step": 2923 }, { "epoch": 265.8727272727273, "grad_norm": 0.3186739981174469, "learning_rate": 2.076e-05, "loss": 0.3767, "step": 2924 }, { "epoch": 265.969696969697, "grad_norm": 0.3134538233280182, "learning_rate": 2.075e-05, "loss": 0.3834, "step": 2925 }, { "epoch": 266.0, "grad_norm": 0.44522473216056824, "learning_rate": 2.074e-05, "loss": 0.3597, "step": 2926 }, { "epoch": 266.0969696969697, "grad_norm": 0.3128274381160736, "learning_rate": 2.0730000000000003e-05, "loss": 0.3552, "step": 2927 }, { "epoch": 266.1939393939394, "grad_norm": 0.27766191959381104, "learning_rate": 2.072e-05, "loss": 0.358, "step": 2928 }, { "epoch": 266.2909090909091, "grad_norm": 0.3204481303691864, "learning_rate": 2.0710000000000003e-05, "loss": 0.3259, "step": 2929 }, { "epoch": 266.3878787878788, "grad_norm": 0.2819754183292389, "learning_rate": 2.07e-05, "loss": 0.3445, "step": 2930 }, { "epoch": 266.3878787878788, "eval_loss": 0.43232443928718567, "eval_runtime": 2.1158, "eval_samples_per_second": 25.995, "eval_steps_per_second": 3.308, "step": 2930 }, { "epoch": 266.4848484848485, "grad_norm": 0.33887726068496704, "learning_rate": 2.069e-05, "loss": 0.3622, "step": 2931 }, { "epoch": 266.58181818181816, "grad_norm": 0.27619192004203796, "learning_rate": 2.0680000000000002e-05, "loss": 0.3591, "step": 2932 }, { "epoch": 266.6787878787879, "grad_norm": 0.2761163115501404, "learning_rate": 2.067e-05, "loss": 0.3481, "step": 2933 }, { "epoch": 266.77575757575755, "grad_norm": 0.2995469272136688, "learning_rate": 2.0660000000000002e-05, "loss": 0.3578, "step": 2934 }, { "epoch": 266.8727272727273, "grad_norm": 0.26977553963661194, "learning_rate": 2.065e-05, "loss": 0.3536, "step": 2935 }, { "epoch": 266.969696969697, "grad_norm": 0.2937801480293274, "learning_rate": 2.0640000000000002e-05, "loss": 0.3335, "step": 2936 }, { "epoch": 267.0, "grad_norm": 0.5464254021644592, "learning_rate": 2.063e-05, "loss": 0.4013, "step": 2937 }, { "epoch": 267.0969696969697, "grad_norm": 0.2871507704257965, "learning_rate": 2.062e-05, "loss": 0.3432, "step": 2938 }, { "epoch": 267.1939393939394, "grad_norm": 0.3833047151565552, "learning_rate": 2.061e-05, "loss": 0.3692, "step": 2939 }, { "epoch": 267.2909090909091, "grad_norm": 0.2943490445613861, "learning_rate": 2.06e-05, "loss": 0.3036, "step": 2940 }, { "epoch": 267.2909090909091, "eval_loss": 0.43172433972358704, "eval_runtime": 2.1026, "eval_samples_per_second": 26.158, "eval_steps_per_second": 3.329, "step": 2940 }, { "epoch": 267.3878787878788, "grad_norm": 0.2477860301733017, "learning_rate": 2.059e-05, "loss": 0.372, "step": 2941 }, { "epoch": 267.4848484848485, "grad_norm": 0.2954455018043518, "learning_rate": 2.0580000000000003e-05, "loss": 0.3343, "step": 2942 }, { "epoch": 267.58181818181816, "grad_norm": 0.2852739989757538, "learning_rate": 2.057e-05, "loss": 0.329, "step": 2943 }, { "epoch": 267.6787878787879, "grad_norm": 0.3311695158481598, "learning_rate": 2.0560000000000003e-05, "loss": 0.3626, "step": 2944 }, { "epoch": 267.77575757575755, "grad_norm": 0.3276229798793793, "learning_rate": 2.055e-05, "loss": 0.3642, "step": 2945 }, { "epoch": 267.8727272727273, "grad_norm": 0.35247862339019775, "learning_rate": 2.054e-05, "loss": 0.3787, "step": 2946 }, { "epoch": 267.969696969697, "grad_norm": 0.24730032682418823, "learning_rate": 2.053e-05, "loss": 0.3486, "step": 2947 }, { "epoch": 268.0, "grad_norm": 0.4214022755622864, "learning_rate": 2.052e-05, "loss": 0.3604, "step": 2948 }, { "epoch": 268.0969696969697, "grad_norm": 0.28409886360168457, "learning_rate": 2.0510000000000002e-05, "loss": 0.3571, "step": 2949 }, { "epoch": 268.1939393939394, "grad_norm": 0.27273720502853394, "learning_rate": 2.05e-05, "loss": 0.3489, "step": 2950 }, { "epoch": 268.1939393939394, "eval_loss": 0.4321860373020172, "eval_runtime": 2.1649, "eval_samples_per_second": 25.405, "eval_steps_per_second": 3.233, "step": 2950 }, { "epoch": 268.2909090909091, "grad_norm": 0.3224973976612091, "learning_rate": 2.0490000000000002e-05, "loss": 0.3593, "step": 2951 }, { "epoch": 268.3878787878788, "grad_norm": 0.29232099652290344, "learning_rate": 2.048e-05, "loss": 0.3588, "step": 2952 }, { "epoch": 268.4848484848485, "grad_norm": 0.30897924304008484, "learning_rate": 2.047e-05, "loss": 0.3953, "step": 2953 }, { "epoch": 268.58181818181816, "grad_norm": 0.3023488223552704, "learning_rate": 2.046e-05, "loss": 0.3967, "step": 2954 }, { "epoch": 268.6787878787879, "grad_norm": 0.2875419557094574, "learning_rate": 2.045e-05, "loss": 0.3137, "step": 2955 }, { "epoch": 268.77575757575755, "grad_norm": 0.30910709500312805, "learning_rate": 2.044e-05, "loss": 0.3439, "step": 2956 }, { "epoch": 268.8727272727273, "grad_norm": 0.3412000834941864, "learning_rate": 2.0430000000000003e-05, "loss": 0.3213, "step": 2957 }, { "epoch": 268.969696969697, "grad_norm": 0.27936646342277527, "learning_rate": 2.042e-05, "loss": 0.3279, "step": 2958 }, { "epoch": 269.0, "grad_norm": 0.4301643371582031, "learning_rate": 2.0410000000000003e-05, "loss": 0.2894, "step": 2959 }, { "epoch": 269.0969696969697, "grad_norm": 0.26649555563926697, "learning_rate": 2.04e-05, "loss": 0.3296, "step": 2960 }, { "epoch": 269.0969696969697, "eval_loss": 0.43194034695625305, "eval_runtime": 2.1128, "eval_samples_per_second": 26.031, "eval_steps_per_second": 3.313, "step": 2960 }, { "epoch": 269.1939393939394, "grad_norm": 0.28858742117881775, "learning_rate": 2.039e-05, "loss": 0.3605, "step": 2961 }, { "epoch": 269.2909090909091, "grad_norm": 0.3377971053123474, "learning_rate": 2.038e-05, "loss": 0.3219, "step": 2962 }, { "epoch": 269.3878787878788, "grad_norm": 0.3358132839202881, "learning_rate": 2.037e-05, "loss": 0.343, "step": 2963 }, { "epoch": 269.4848484848485, "grad_norm": 0.3058205544948578, "learning_rate": 2.036e-05, "loss": 0.3609, "step": 2964 }, { "epoch": 269.58181818181816, "grad_norm": 0.3367607295513153, "learning_rate": 2.035e-05, "loss": 0.347, "step": 2965 }, { "epoch": 269.6787878787879, "grad_norm": 0.33359044790267944, "learning_rate": 2.0340000000000002e-05, "loss": 0.3899, "step": 2966 }, { "epoch": 269.77575757575755, "grad_norm": 0.28831222653388977, "learning_rate": 2.033e-05, "loss": 0.3559, "step": 2967 }, { "epoch": 269.8727272727273, "grad_norm": 0.24753743410110474, "learning_rate": 2.032e-05, "loss": 0.3609, "step": 2968 }, { "epoch": 269.969696969697, "grad_norm": 0.3068210482597351, "learning_rate": 2.031e-05, "loss": 0.3495, "step": 2969 }, { "epoch": 270.0, "grad_norm": 0.5752652883529663, "learning_rate": 2.0300000000000002e-05, "loss": 0.3073, "step": 2970 }, { "epoch": 270.0, "eval_loss": 0.43105173110961914, "eval_runtime": 2.1156, "eval_samples_per_second": 25.997, "eval_steps_per_second": 3.309, "step": 2970 }, { "epoch": 270.0969696969697, "grad_norm": 0.289946973323822, "learning_rate": 2.029e-05, "loss": 0.3672, "step": 2971 }, { "epoch": 270.1939393939394, "grad_norm": 0.2912636399269104, "learning_rate": 2.0280000000000002e-05, "loss": 0.3491, "step": 2972 }, { "epoch": 270.2909090909091, "grad_norm": 0.4258723258972168, "learning_rate": 2.027e-05, "loss": 0.3439, "step": 2973 }, { "epoch": 270.3878787878788, "grad_norm": 0.24976497888565063, "learning_rate": 2.0260000000000003e-05, "loss": 0.3305, "step": 2974 }, { "epoch": 270.4848484848485, "grad_norm": 0.3167341351509094, "learning_rate": 2.025e-05, "loss": 0.3744, "step": 2975 }, { "epoch": 270.58181818181816, "grad_norm": 0.3470234274864197, "learning_rate": 2.024e-05, "loss": 0.3286, "step": 2976 }, { "epoch": 270.6787878787879, "grad_norm": 0.30521225929260254, "learning_rate": 2.023e-05, "loss": 0.3427, "step": 2977 }, { "epoch": 270.77575757575755, "grad_norm": 0.3303682804107666, "learning_rate": 2.022e-05, "loss": 0.371, "step": 2978 }, { "epoch": 270.8727272727273, "grad_norm": 0.31925997138023376, "learning_rate": 2.021e-05, "loss": 0.3558, "step": 2979 }, { "epoch": 270.969696969697, "grad_norm": 0.3036620616912842, "learning_rate": 2.0200000000000003e-05, "loss": 0.3245, "step": 2980 }, { "epoch": 270.969696969697, "eval_loss": 0.4314921498298645, "eval_runtime": 2.1128, "eval_samples_per_second": 26.032, "eval_steps_per_second": 3.313, "step": 2980 }, { "epoch": 271.0, "grad_norm": 0.5975680351257324, "learning_rate": 2.019e-05, "loss": 0.3874, "step": 2981 }, { "epoch": 271.0969696969697, "grad_norm": 0.2873639464378357, "learning_rate": 2.0180000000000003e-05, "loss": 0.3234, "step": 2982 }, { "epoch": 271.1939393939394, "grad_norm": 0.2703818082809448, "learning_rate": 2.017e-05, "loss": 0.345, "step": 2983 }, { "epoch": 271.2909090909091, "grad_norm": 0.3004109561443329, "learning_rate": 2.016e-05, "loss": 0.3726, "step": 2984 }, { "epoch": 271.3878787878788, "grad_norm": 0.29286929965019226, "learning_rate": 2.0150000000000002e-05, "loss": 0.3264, "step": 2985 }, { "epoch": 271.4848484848485, "grad_norm": 0.3061460554599762, "learning_rate": 2.014e-05, "loss": 0.338, "step": 2986 }, { "epoch": 271.58181818181816, "grad_norm": 0.3210451304912567, "learning_rate": 2.0130000000000002e-05, "loss": 0.3491, "step": 2987 }, { "epoch": 271.6787878787879, "grad_norm": 0.3391363024711609, "learning_rate": 2.012e-05, "loss": 0.3642, "step": 2988 }, { "epoch": 271.77575757575755, "grad_norm": 0.3371747136116028, "learning_rate": 2.0110000000000002e-05, "loss": 0.3192, "step": 2989 }, { "epoch": 271.8727272727273, "grad_norm": 0.29019173979759216, "learning_rate": 2.01e-05, "loss": 0.3577, "step": 2990 }, { "epoch": 271.8727272727273, "eval_loss": 0.43195751309394836, "eval_runtime": 2.1447, "eval_samples_per_second": 25.645, "eval_steps_per_second": 3.264, "step": 2990 }, { "epoch": 271.969696969697, "grad_norm": 0.3430919945240021, "learning_rate": 2.009e-05, "loss": 0.3814, "step": 2991 }, { "epoch": 272.0, "grad_norm": 0.6806743144989014, "learning_rate": 2.008e-05, "loss": 0.4202, "step": 2992 }, { "epoch": 272.0969696969697, "grad_norm": 0.323353111743927, "learning_rate": 2.007e-05, "loss": 0.3634, "step": 2993 }, { "epoch": 272.1939393939394, "grad_norm": 0.3096074163913727, "learning_rate": 2.006e-05, "loss": 0.3412, "step": 2994 }, { "epoch": 272.2909090909091, "grad_norm": 0.2603781819343567, "learning_rate": 2.0050000000000003e-05, "loss": 0.3313, "step": 2995 }, { "epoch": 272.3878787878788, "grad_norm": 0.38184812664985657, "learning_rate": 2.004e-05, "loss": 0.3773, "step": 2996 }, { "epoch": 272.4848484848485, "grad_norm": 0.28190943598747253, "learning_rate": 2.0030000000000003e-05, "loss": 0.3858, "step": 2997 }, { "epoch": 272.58181818181816, "grad_norm": 0.29366445541381836, "learning_rate": 2.002e-05, "loss": 0.3654, "step": 2998 }, { "epoch": 272.6787878787879, "grad_norm": 0.3181076645851135, "learning_rate": 2.001e-05, "loss": 0.3091, "step": 2999 }, { "epoch": 272.77575757575755, "grad_norm": 0.31197789311408997, "learning_rate": 2e-05, "loss": 0.3291, "step": 3000 }, { "epoch": 272.77575757575755, "eval_loss": 0.43125858902931213, "eval_runtime": 2.1098, "eval_samples_per_second": 26.068, "eval_steps_per_second": 3.318, "step": 3000 }, { "epoch": 272.8727272727273, "grad_norm": 0.27228426933288574, "learning_rate": 1.999e-05, "loss": 0.314, "step": 3001 }, { "epoch": 272.969696969697, "grad_norm": 0.27844688296318054, "learning_rate": 1.9980000000000002e-05, "loss": 0.381, "step": 3002 }, { "epoch": 273.0, "grad_norm": 0.5656706094741821, "learning_rate": 1.997e-05, "loss": 0.3438, "step": 3003 }, { "epoch": 273.0969696969697, "grad_norm": 0.3047666549682617, "learning_rate": 1.9960000000000002e-05, "loss": 0.3674, "step": 3004 }, { "epoch": 273.1939393939394, "grad_norm": 0.2743180990219116, "learning_rate": 1.995e-05, "loss": 0.3293, "step": 3005 }, { "epoch": 273.2909090909091, "grad_norm": 0.3302759826183319, "learning_rate": 1.994e-05, "loss": 0.3426, "step": 3006 }, { "epoch": 273.3878787878788, "grad_norm": 0.3315967321395874, "learning_rate": 1.993e-05, "loss": 0.3581, "step": 3007 }, { "epoch": 273.4848484848485, "grad_norm": 0.3116733133792877, "learning_rate": 1.992e-05, "loss": 0.3407, "step": 3008 }, { "epoch": 273.58181818181816, "grad_norm": 0.27811938524246216, "learning_rate": 1.991e-05, "loss": 0.3487, "step": 3009 }, { "epoch": 273.6787878787879, "grad_norm": 0.29493939876556396, "learning_rate": 1.9900000000000003e-05, "loss": 0.3717, "step": 3010 }, { "epoch": 273.6787878787879, "eval_loss": 0.4310474693775177, "eval_runtime": 2.1139, "eval_samples_per_second": 26.018, "eval_steps_per_second": 3.311, "step": 3010 }, { "epoch": 273.77575757575755, "grad_norm": 0.2869589626789093, "learning_rate": 1.989e-05, "loss": 0.3577, "step": 3011 }, { "epoch": 273.8727272727273, "grad_norm": 0.32067158818244934, "learning_rate": 1.9880000000000003e-05, "loss": 0.3186, "step": 3012 }, { "epoch": 273.969696969697, "grad_norm": 0.3775586485862732, "learning_rate": 1.987e-05, "loss": 0.3697, "step": 3013 }, { "epoch": 274.0, "grad_norm": 0.4237777888774872, "learning_rate": 1.986e-05, "loss": 0.313, "step": 3014 }, { "epoch": 274.0969696969697, "grad_norm": 0.2924163341522217, "learning_rate": 1.985e-05, "loss": 0.3671, "step": 3015 }, { "epoch": 274.1939393939394, "grad_norm": 0.29473182559013367, "learning_rate": 1.984e-05, "loss": 0.376, "step": 3016 }, { "epoch": 274.2909090909091, "grad_norm": 0.35198548436164856, "learning_rate": 1.983e-05, "loss": 0.3578, "step": 3017 }, { "epoch": 274.3878787878788, "grad_norm": 0.2765541672706604, "learning_rate": 1.982e-05, "loss": 0.3727, "step": 3018 }, { "epoch": 274.4848484848485, "grad_norm": 0.3288250267505646, "learning_rate": 1.9810000000000002e-05, "loss": 0.329, "step": 3019 }, { "epoch": 274.58181818181816, "grad_norm": 0.3038225769996643, "learning_rate": 1.9800000000000004e-05, "loss": 0.3089, "step": 3020 }, { "epoch": 274.58181818181816, "eval_loss": 0.43139633536338806, "eval_runtime": 2.1041, "eval_samples_per_second": 26.139, "eval_steps_per_second": 3.327, "step": 3020 }, { "epoch": 274.6787878787879, "grad_norm": 0.28620851039886475, "learning_rate": 1.979e-05, "loss": 0.3792, "step": 3021 }, { "epoch": 274.77575757575755, "grad_norm": 0.24123451113700867, "learning_rate": 1.978e-05, "loss": 0.3362, "step": 3022 }, { "epoch": 274.8727272727273, "grad_norm": 0.2994822859764099, "learning_rate": 1.977e-05, "loss": 0.3286, "step": 3023 }, { "epoch": 274.969696969697, "grad_norm": 0.2507471442222595, "learning_rate": 1.976e-05, "loss": 0.3346, "step": 3024 }, { "epoch": 275.0, "grad_norm": 0.49847498536109924, "learning_rate": 1.9750000000000002e-05, "loss": 0.3411, "step": 3025 }, { "epoch": 275.0969696969697, "grad_norm": 0.30484265089035034, "learning_rate": 1.974e-05, "loss": 0.3269, "step": 3026 }, { "epoch": 275.1939393939394, "grad_norm": 0.3385094702243805, "learning_rate": 1.9730000000000003e-05, "loss": 0.3751, "step": 3027 }, { "epoch": 275.2909090909091, "grad_norm": 0.27394574880599976, "learning_rate": 1.972e-05, "loss": 0.3457, "step": 3028 }, { "epoch": 275.3878787878788, "grad_norm": 0.3241487443447113, "learning_rate": 1.971e-05, "loss": 0.3625, "step": 3029 }, { "epoch": 275.4848484848485, "grad_norm": 0.2895892262458801, "learning_rate": 1.97e-05, "loss": 0.3576, "step": 3030 }, { "epoch": 275.4848484848485, "eval_loss": 0.43097203969955444, "eval_runtime": 2.1081, "eval_samples_per_second": 26.09, "eval_steps_per_second": 3.321, "step": 3030 }, { "epoch": 275.58181818181816, "grad_norm": 0.3027297556400299, "learning_rate": 1.969e-05, "loss": 0.3301, "step": 3031 }, { "epoch": 275.6787878787879, "grad_norm": 0.34131574630737305, "learning_rate": 1.968e-05, "loss": 0.3459, "step": 3032 }, { "epoch": 275.77575757575755, "grad_norm": 0.2701811194419861, "learning_rate": 1.9670000000000003e-05, "loss": 0.345, "step": 3033 }, { "epoch": 275.8727272727273, "grad_norm": 0.337098628282547, "learning_rate": 1.966e-05, "loss": 0.3745, "step": 3034 }, { "epoch": 275.969696969697, "grad_norm": 0.3441219627857208, "learning_rate": 1.9650000000000003e-05, "loss": 0.3189, "step": 3035 }, { "epoch": 276.0, "grad_norm": 0.717963457107544, "learning_rate": 1.9640000000000002e-05, "loss": 0.3628, "step": 3036 }, { "epoch": 276.0969696969697, "grad_norm": 0.27845311164855957, "learning_rate": 1.963e-05, "loss": 0.3544, "step": 3037 }, { "epoch": 276.1939393939394, "grad_norm": 0.36221420764923096, "learning_rate": 1.9620000000000002e-05, "loss": 0.3806, "step": 3038 }, { "epoch": 276.2909090909091, "grad_norm": 0.34527480602264404, "learning_rate": 1.961e-05, "loss": 0.3743, "step": 3039 }, { "epoch": 276.3878787878788, "grad_norm": 0.3114314675331116, "learning_rate": 1.9600000000000002e-05, "loss": 0.3565, "step": 3040 }, { "epoch": 276.3878787878788, "eval_loss": 0.43109825253486633, "eval_runtime": 2.131, "eval_samples_per_second": 25.81, "eval_steps_per_second": 3.285, "step": 3040 }, { "epoch": 276.4848484848485, "grad_norm": 0.3695688843727112, "learning_rate": 1.959e-05, "loss": 0.3316, "step": 3041 }, { "epoch": 276.58181818181816, "grad_norm": 0.29446908831596375, "learning_rate": 1.9580000000000002e-05, "loss": 0.3476, "step": 3042 }, { "epoch": 276.6787878787879, "grad_norm": 0.30297887325286865, "learning_rate": 1.957e-05, "loss": 0.3114, "step": 3043 }, { "epoch": 276.77575757575755, "grad_norm": 0.30303916335105896, "learning_rate": 1.956e-05, "loss": 0.3298, "step": 3044 }, { "epoch": 276.8727272727273, "grad_norm": 0.28460606932640076, "learning_rate": 1.955e-05, "loss": 0.3494, "step": 3045 }, { "epoch": 276.969696969697, "grad_norm": 0.3434138596057892, "learning_rate": 1.954e-05, "loss": 0.3583, "step": 3046 }, { "epoch": 277.0, "grad_norm": 0.4618801474571228, "learning_rate": 1.953e-05, "loss": 0.3074, "step": 3047 }, { "epoch": 277.0969696969697, "grad_norm": 0.32681795954704285, "learning_rate": 1.9520000000000003e-05, "loss": 0.318, "step": 3048 }, { "epoch": 277.1939393939394, "grad_norm": 0.2953447699546814, "learning_rate": 1.951e-05, "loss": 0.374, "step": 3049 }, { "epoch": 277.2909090909091, "grad_norm": 0.28921085596084595, "learning_rate": 1.9500000000000003e-05, "loss": 0.3511, "step": 3050 }, { "epoch": 277.2909090909091, "eval_loss": 0.4307308495044708, "eval_runtime": 2.1197, "eval_samples_per_second": 25.947, "eval_steps_per_second": 3.302, "step": 3050 }, { "epoch": 277.3878787878788, "grad_norm": 0.3594936430454254, "learning_rate": 1.949e-05, "loss": 0.3635, "step": 3051 }, { "epoch": 277.4848484848485, "grad_norm": 0.34557226300239563, "learning_rate": 1.948e-05, "loss": 0.3457, "step": 3052 }, { "epoch": 277.58181818181816, "grad_norm": 0.2568785548210144, "learning_rate": 1.947e-05, "loss": 0.3378, "step": 3053 }, { "epoch": 277.6787878787879, "grad_norm": 0.27184462547302246, "learning_rate": 1.946e-05, "loss": 0.3379, "step": 3054 }, { "epoch": 277.77575757575755, "grad_norm": 0.29772868752479553, "learning_rate": 1.9450000000000002e-05, "loss": 0.3628, "step": 3055 }, { "epoch": 277.8727272727273, "grad_norm": 0.35863181948661804, "learning_rate": 1.944e-05, "loss": 0.3368, "step": 3056 }, { "epoch": 277.969696969697, "grad_norm": 0.28443828225135803, "learning_rate": 1.9430000000000002e-05, "loss": 0.3488, "step": 3057 }, { "epoch": 278.0, "grad_norm": 0.4689065217971802, "learning_rate": 1.942e-05, "loss": 0.3654, "step": 3058 }, { "epoch": 278.0969696969697, "grad_norm": 0.27923905849456787, "learning_rate": 1.941e-05, "loss": 0.3318, "step": 3059 }, { "epoch": 278.1939393939394, "grad_norm": 0.28366586565971375, "learning_rate": 1.94e-05, "loss": 0.381, "step": 3060 }, { "epoch": 278.1939393939394, "eval_loss": 0.43080776929855347, "eval_runtime": 2.106, "eval_samples_per_second": 26.116, "eval_steps_per_second": 3.324, "step": 3060 }, { "epoch": 278.2909090909091, "grad_norm": 0.29033416509628296, "learning_rate": 1.939e-05, "loss": 0.2994, "step": 3061 }, { "epoch": 278.3878787878788, "grad_norm": 0.29525431990623474, "learning_rate": 1.938e-05, "loss": 0.3606, "step": 3062 }, { "epoch": 278.4848484848485, "grad_norm": 0.29698970913887024, "learning_rate": 1.9370000000000003e-05, "loss": 0.3392, "step": 3063 }, { "epoch": 278.58181818181816, "grad_norm": 0.27126365900039673, "learning_rate": 1.936e-05, "loss": 0.3429, "step": 3064 }, { "epoch": 278.6787878787879, "grad_norm": 0.3190431594848633, "learning_rate": 1.9350000000000003e-05, "loss": 0.3631, "step": 3065 }, { "epoch": 278.77575757575755, "grad_norm": 0.3183770179748535, "learning_rate": 1.934e-05, "loss": 0.3582, "step": 3066 }, { "epoch": 278.8727272727273, "grad_norm": 0.28862932324409485, "learning_rate": 1.933e-05, "loss": 0.3427, "step": 3067 }, { "epoch": 278.969696969697, "grad_norm": 0.31062668561935425, "learning_rate": 1.932e-05, "loss": 0.3639, "step": 3068 }, { "epoch": 279.0, "grad_norm": 0.3792564570903778, "learning_rate": 1.931e-05, "loss": 0.3284, "step": 3069 }, { "epoch": 279.0969696969697, "grad_norm": 0.26903533935546875, "learning_rate": 1.93e-05, "loss": 0.3243, "step": 3070 }, { "epoch": 279.0969696969697, "eval_loss": 0.4307677447795868, "eval_runtime": 2.0965, "eval_samples_per_second": 26.234, "eval_steps_per_second": 3.339, "step": 3070 }, { "epoch": 279.1939393939394, "grad_norm": 0.28425025939941406, "learning_rate": 1.929e-05, "loss": 0.3522, "step": 3071 }, { "epoch": 279.2909090909091, "grad_norm": 0.2826736867427826, "learning_rate": 1.9280000000000002e-05, "loss": 0.3038, "step": 3072 }, { "epoch": 279.3878787878788, "grad_norm": 0.3037801682949066, "learning_rate": 1.9270000000000004e-05, "loss": 0.3718, "step": 3073 }, { "epoch": 279.4848484848485, "grad_norm": 0.319252610206604, "learning_rate": 1.9260000000000002e-05, "loss": 0.3388, "step": 3074 }, { "epoch": 279.58181818181816, "grad_norm": 0.29738590121269226, "learning_rate": 1.925e-05, "loss": 0.3716, "step": 3075 }, { "epoch": 279.6787878787879, "grad_norm": 0.28336021304130554, "learning_rate": 1.924e-05, "loss": 0.3868, "step": 3076 }, { "epoch": 279.77575757575755, "grad_norm": 0.29788967967033386, "learning_rate": 1.923e-05, "loss": 0.3595, "step": 3077 }, { "epoch": 279.8727272727273, "grad_norm": 0.2871606647968292, "learning_rate": 1.9220000000000002e-05, "loss": 0.3453, "step": 3078 }, { "epoch": 279.969696969697, "grad_norm": 0.32829487323760986, "learning_rate": 1.921e-05, "loss": 0.3364, "step": 3079 }, { "epoch": 280.0, "grad_norm": 0.43252795934677124, "learning_rate": 1.9200000000000003e-05, "loss": 0.2944, "step": 3080 }, { "epoch": 280.0, "eval_loss": 0.4305260479450226, "eval_runtime": 2.1118, "eval_samples_per_second": 26.044, "eval_steps_per_second": 3.315, "step": 3080 }, { "epoch": 280.0969696969697, "grad_norm": 0.28369900584220886, "learning_rate": 1.919e-05, "loss": 0.3107, "step": 3081 }, { "epoch": 280.1939393939394, "grad_norm": 0.2864631116390228, "learning_rate": 1.918e-05, "loss": 0.3431, "step": 3082 }, { "epoch": 280.2909090909091, "grad_norm": 0.2793765962123871, "learning_rate": 1.917e-05, "loss": 0.3503, "step": 3083 }, { "epoch": 280.3878787878788, "grad_norm": 0.2945725619792938, "learning_rate": 1.916e-05, "loss": 0.355, "step": 3084 }, { "epoch": 280.4848484848485, "grad_norm": 0.29189518094062805, "learning_rate": 1.915e-05, "loss": 0.3404, "step": 3085 }, { "epoch": 280.58181818181816, "grad_norm": 0.3202930986881256, "learning_rate": 1.914e-05, "loss": 0.3533, "step": 3086 }, { "epoch": 280.6787878787879, "grad_norm": 0.3237863779067993, "learning_rate": 1.913e-05, "loss": 0.3396, "step": 3087 }, { "epoch": 280.77575757575755, "grad_norm": 0.42770758271217346, "learning_rate": 1.9120000000000003e-05, "loss": 0.3605, "step": 3088 }, { "epoch": 280.8727272727273, "grad_norm": 0.25378715991973877, "learning_rate": 1.911e-05, "loss": 0.3603, "step": 3089 }, { "epoch": 280.969696969697, "grad_norm": 0.28680214285850525, "learning_rate": 1.91e-05, "loss": 0.3551, "step": 3090 }, { "epoch": 280.969696969697, "eval_loss": 0.4307253956794739, "eval_runtime": 2.108, "eval_samples_per_second": 26.092, "eval_steps_per_second": 3.321, "step": 3090 }, { "epoch": 281.0, "grad_norm": 0.5535325407981873, "learning_rate": 1.909e-05, "loss": 0.3546, "step": 3091 }, { "epoch": 281.0969696969697, "grad_norm": 0.27855050563812256, "learning_rate": 1.908e-05, "loss": 0.3554, "step": 3092 }, { "epoch": 281.1939393939394, "grad_norm": 0.29888078570365906, "learning_rate": 1.9070000000000002e-05, "loss": 0.3379, "step": 3093 }, { "epoch": 281.2909090909091, "grad_norm": 0.28423425555229187, "learning_rate": 1.906e-05, "loss": 0.372, "step": 3094 }, { "epoch": 281.3878787878788, "grad_norm": 0.2922607958316803, "learning_rate": 1.9050000000000002e-05, "loss": 0.347, "step": 3095 }, { "epoch": 281.4848484848485, "grad_norm": 0.2930501699447632, "learning_rate": 1.904e-05, "loss": 0.3176, "step": 3096 }, { "epoch": 281.58181818181816, "grad_norm": 0.32645270228385925, "learning_rate": 1.903e-05, "loss": 0.3394, "step": 3097 }, { "epoch": 281.6787878787879, "grad_norm": 0.28562331199645996, "learning_rate": 1.902e-05, "loss": 0.3122, "step": 3098 }, { "epoch": 281.77575757575755, "grad_norm": 0.3255389630794525, "learning_rate": 1.901e-05, "loss": 0.3787, "step": 3099 }, { "epoch": 281.8727272727273, "grad_norm": 0.29790711402893066, "learning_rate": 1.9e-05, "loss": 0.3318, "step": 3100 }, { "epoch": 281.8727272727273, "eval_loss": 0.43041619658470154, "eval_runtime": 2.0976, "eval_samples_per_second": 26.22, "eval_steps_per_second": 3.337, "step": 3100 }, { "epoch": 281.969696969697, "grad_norm": 0.29323795437812805, "learning_rate": 1.8990000000000003e-05, "loss": 0.372, "step": 3101 }, { "epoch": 282.0, "grad_norm": 0.5759214162826538, "learning_rate": 1.898e-05, "loss": 0.3692, "step": 3102 }, { "epoch": 282.0969696969697, "grad_norm": 0.26442188024520874, "learning_rate": 1.8970000000000003e-05, "loss": 0.3629, "step": 3103 }, { "epoch": 282.1939393939394, "grad_norm": 0.34483394026756287, "learning_rate": 1.896e-05, "loss": 0.3764, "step": 3104 }, { "epoch": 282.2909090909091, "grad_norm": 0.3741251230239868, "learning_rate": 1.895e-05, "loss": 0.3287, "step": 3105 }, { "epoch": 282.3878787878788, "grad_norm": 0.28175756335258484, "learning_rate": 1.894e-05, "loss": 0.3671, "step": 3106 }, { "epoch": 282.4848484848485, "grad_norm": 0.3492465019226074, "learning_rate": 1.893e-05, "loss": 0.3595, "step": 3107 }, { "epoch": 282.58181818181816, "grad_norm": 0.2611752450466156, "learning_rate": 1.8920000000000002e-05, "loss": 0.301, "step": 3108 }, { "epoch": 282.6787878787879, "grad_norm": 0.3798886835575104, "learning_rate": 1.891e-05, "loss": 0.3393, "step": 3109 }, { "epoch": 282.77575757575755, "grad_norm": 0.34574228525161743, "learning_rate": 1.8900000000000002e-05, "loss": 0.3288, "step": 3110 }, { "epoch": 282.77575757575755, "eval_loss": 0.43000346422195435, "eval_runtime": 2.1018, "eval_samples_per_second": 26.168, "eval_steps_per_second": 3.33, "step": 3110 }, { "epoch": 282.8727272727273, "grad_norm": 0.301952600479126, "learning_rate": 1.8890000000000004e-05, "loss": 0.3288, "step": 3111 }, { "epoch": 282.969696969697, "grad_norm": 0.26290667057037354, "learning_rate": 1.888e-05, "loss": 0.3848, "step": 3112 }, { "epoch": 283.0, "grad_norm": 0.46030494570732117, "learning_rate": 1.887e-05, "loss": 0.3212, "step": 3113 }, { "epoch": 283.0969696969697, "grad_norm": 0.3023940324783325, "learning_rate": 1.886e-05, "loss": 0.3092, "step": 3114 }, { "epoch": 283.1939393939394, "grad_norm": 0.32762202620506287, "learning_rate": 1.885e-05, "loss": 0.3419, "step": 3115 }, { "epoch": 283.2909090909091, "grad_norm": 0.28179922699928284, "learning_rate": 1.8840000000000003e-05, "loss": 0.339, "step": 3116 }, { "epoch": 283.3878787878788, "grad_norm": 0.2876780331134796, "learning_rate": 1.883e-05, "loss": 0.3731, "step": 3117 }, { "epoch": 283.4848484848485, "grad_norm": 0.2742939889431, "learning_rate": 1.8820000000000003e-05, "loss": 0.2887, "step": 3118 }, { "epoch": 283.58181818181816, "grad_norm": 0.3161603808403015, "learning_rate": 1.881e-05, "loss": 0.3623, "step": 3119 }, { "epoch": 283.6787878787879, "grad_norm": 0.2728952467441559, "learning_rate": 1.88e-05, "loss": 0.3722, "step": 3120 }, { "epoch": 283.6787878787879, "eval_loss": 0.4307185113430023, "eval_runtime": 2.1149, "eval_samples_per_second": 26.006, "eval_steps_per_second": 3.31, "step": 3120 }, { "epoch": 283.77575757575755, "grad_norm": 0.23937064409255981, "learning_rate": 1.879e-05, "loss": 0.3412, "step": 3121 }, { "epoch": 283.8727272727273, "grad_norm": 0.2939877212047577, "learning_rate": 1.878e-05, "loss": 0.343, "step": 3122 }, { "epoch": 283.969696969697, "grad_norm": 0.3187495768070221, "learning_rate": 1.877e-05, "loss": 0.3873, "step": 3123 }, { "epoch": 284.0, "grad_norm": 0.5333037972450256, "learning_rate": 1.876e-05, "loss": 0.3612, "step": 3124 }, { "epoch": 284.0969696969697, "grad_norm": 0.26920658349990845, "learning_rate": 1.8750000000000002e-05, "loss": 0.3463, "step": 3125 }, { "epoch": 284.1939393939394, "grad_norm": 0.2812713086605072, "learning_rate": 1.8740000000000004e-05, "loss": 0.3433, "step": 3126 }, { "epoch": 284.2909090909091, "grad_norm": 0.28586363792419434, "learning_rate": 1.8730000000000002e-05, "loss": 0.335, "step": 3127 }, { "epoch": 284.3878787878788, "grad_norm": 0.2654596269130707, "learning_rate": 1.872e-05, "loss": 0.36, "step": 3128 }, { "epoch": 284.4848484848485, "grad_norm": 0.30611246824264526, "learning_rate": 1.871e-05, "loss": 0.3661, "step": 3129 }, { "epoch": 284.58181818181816, "grad_norm": 0.2727811634540558, "learning_rate": 1.87e-05, "loss": 0.3221, "step": 3130 }, { "epoch": 284.58181818181816, "eval_loss": 0.4300423562526703, "eval_runtime": 2.1098, "eval_samples_per_second": 26.069, "eval_steps_per_second": 3.318, "step": 3130 }, { "epoch": 284.6787878787879, "grad_norm": 0.3090428113937378, "learning_rate": 1.8690000000000002e-05, "loss": 0.3605, "step": 3131 }, { "epoch": 284.77575757575755, "grad_norm": 0.276574969291687, "learning_rate": 1.868e-05, "loss": 0.3638, "step": 3132 }, { "epoch": 284.8727272727273, "grad_norm": 0.26750755310058594, "learning_rate": 1.8670000000000003e-05, "loss": 0.3213, "step": 3133 }, { "epoch": 284.969696969697, "grad_norm": 0.3467130959033966, "learning_rate": 1.866e-05, "loss": 0.353, "step": 3134 }, { "epoch": 285.0, "grad_norm": 0.37440377473831177, "learning_rate": 1.865e-05, "loss": 0.3205, "step": 3135 }, { "epoch": 285.0969696969697, "grad_norm": 0.2853250801563263, "learning_rate": 1.864e-05, "loss": 0.3198, "step": 3136 }, { "epoch": 285.1939393939394, "grad_norm": 0.2645166218280792, "learning_rate": 1.863e-05, "loss": 0.3163, "step": 3137 }, { "epoch": 285.2909090909091, "grad_norm": 0.2701696455478668, "learning_rate": 1.862e-05, "loss": 0.3391, "step": 3138 }, { "epoch": 285.3878787878788, "grad_norm": 0.312705397605896, "learning_rate": 1.861e-05, "loss": 0.353, "step": 3139 }, { "epoch": 285.4848484848485, "grad_norm": 0.2816062271595001, "learning_rate": 1.86e-05, "loss": 0.35, "step": 3140 }, { "epoch": 285.4848484848485, "eval_loss": 0.43025192618370056, "eval_runtime": 2.108, "eval_samples_per_second": 26.091, "eval_steps_per_second": 3.321, "step": 3140 }, { "epoch": 285.58181818181816, "grad_norm": 0.26278868317604065, "learning_rate": 1.8590000000000003e-05, "loss": 0.3185, "step": 3141 }, { "epoch": 285.6787878787879, "grad_norm": 0.3085387945175171, "learning_rate": 1.858e-05, "loss": 0.3627, "step": 3142 }, { "epoch": 285.77575757575755, "grad_norm": 0.28027740120887756, "learning_rate": 1.857e-05, "loss": 0.3828, "step": 3143 }, { "epoch": 285.8727272727273, "grad_norm": 0.2845478355884552, "learning_rate": 1.856e-05, "loss": 0.3368, "step": 3144 }, { "epoch": 285.969696969697, "grad_norm": 0.3337837755680084, "learning_rate": 1.855e-05, "loss": 0.3708, "step": 3145 }, { "epoch": 286.0, "grad_norm": 0.47991085052490234, "learning_rate": 1.8540000000000002e-05, "loss": 0.3774, "step": 3146 }, { "epoch": 286.0969696969697, "grad_norm": 0.2537936568260193, "learning_rate": 1.853e-05, "loss": 0.3285, "step": 3147 }, { "epoch": 286.1939393939394, "grad_norm": 0.3946389853954315, "learning_rate": 1.8520000000000002e-05, "loss": 0.3513, "step": 3148 }, { "epoch": 286.2909090909091, "grad_norm": 0.2878142297267914, "learning_rate": 1.851e-05, "loss": 0.3444, "step": 3149 }, { "epoch": 286.3878787878788, "grad_norm": 0.28056782484054565, "learning_rate": 1.85e-05, "loss": 0.3674, "step": 3150 }, { "epoch": 286.3878787878788, "eval_loss": 0.42976757884025574, "eval_runtime": 2.0736, "eval_samples_per_second": 26.524, "eval_steps_per_second": 3.376, "step": 3150 }, { "epoch": 286.4848484848485, "grad_norm": 0.30548804998397827, "learning_rate": 1.849e-05, "loss": 0.342, "step": 3151 }, { "epoch": 286.58181818181816, "grad_norm": 0.30740660429000854, "learning_rate": 1.848e-05, "loss": 0.3405, "step": 3152 }, { "epoch": 286.6787878787879, "grad_norm": 0.29892516136169434, "learning_rate": 1.847e-05, "loss": 0.3436, "step": 3153 }, { "epoch": 286.77575757575755, "grad_norm": 0.3422331213951111, "learning_rate": 1.846e-05, "loss": 0.3366, "step": 3154 }, { "epoch": 286.8727272727273, "grad_norm": 0.29767316579818726, "learning_rate": 1.845e-05, "loss": 0.3488, "step": 3155 }, { "epoch": 286.969696969697, "grad_norm": 0.26103007793426514, "learning_rate": 1.8440000000000003e-05, "loss": 0.3691, "step": 3156 }, { "epoch": 287.0, "grad_norm": 0.5521721839904785, "learning_rate": 1.843e-05, "loss": 0.3043, "step": 3157 }, { "epoch": 287.0969696969697, "grad_norm": 0.2533706724643707, "learning_rate": 1.842e-05, "loss": 0.3805, "step": 3158 }, { "epoch": 287.1939393939394, "grad_norm": 0.25408628582954407, "learning_rate": 1.841e-05, "loss": 0.2988, "step": 3159 }, { "epoch": 287.2909090909091, "grad_norm": 0.2781979739665985, "learning_rate": 1.84e-05, "loss": 0.3651, "step": 3160 }, { "epoch": 287.2909090909091, "eval_loss": 0.42997777462005615, "eval_runtime": 2.1056, "eval_samples_per_second": 26.121, "eval_steps_per_second": 3.324, "step": 3160 }, { "epoch": 287.3878787878788, "grad_norm": 0.2921355664730072, "learning_rate": 1.8390000000000002e-05, "loss": 0.3701, "step": 3161 }, { "epoch": 287.4848484848485, "grad_norm": 0.3304900825023651, "learning_rate": 1.838e-05, "loss": 0.339, "step": 3162 }, { "epoch": 287.58181818181816, "grad_norm": 0.3184772729873657, "learning_rate": 1.8370000000000002e-05, "loss": 0.3453, "step": 3163 }, { "epoch": 287.6787878787879, "grad_norm": 0.2869490683078766, "learning_rate": 1.8360000000000004e-05, "loss": 0.3134, "step": 3164 }, { "epoch": 287.77575757575755, "grad_norm": 0.2816450595855713, "learning_rate": 1.8350000000000002e-05, "loss": 0.3279, "step": 3165 }, { "epoch": 287.8727272727273, "grad_norm": 0.3218279182910919, "learning_rate": 1.834e-05, "loss": 0.3545, "step": 3166 }, { "epoch": 287.969696969697, "grad_norm": 0.29858097434043884, "learning_rate": 1.833e-05, "loss": 0.3568, "step": 3167 }, { "epoch": 288.0, "grad_norm": 0.5968769192695618, "learning_rate": 1.832e-05, "loss": 0.3491, "step": 3168 }, { "epoch": 288.0969696969697, "grad_norm": 0.32061246037483215, "learning_rate": 1.8310000000000003e-05, "loss": 0.3503, "step": 3169 }, { "epoch": 288.1939393939394, "grad_norm": 0.28813618421554565, "learning_rate": 1.83e-05, "loss": 0.3622, "step": 3170 }, { "epoch": 288.1939393939394, "eval_loss": 0.4300357401371002, "eval_runtime": 2.105, "eval_samples_per_second": 26.129, "eval_steps_per_second": 3.325, "step": 3170 }, { "epoch": 288.2909090909091, "grad_norm": 0.3164956569671631, "learning_rate": 1.8290000000000003e-05, "loss": 0.3108, "step": 3171 }, { "epoch": 288.3878787878788, "grad_norm": 0.25404638051986694, "learning_rate": 1.828e-05, "loss": 0.3372, "step": 3172 }, { "epoch": 288.4848484848485, "grad_norm": 0.25583022832870483, "learning_rate": 1.827e-05, "loss": 0.3289, "step": 3173 }, { "epoch": 288.58181818181816, "grad_norm": 0.48261046409606934, "learning_rate": 1.826e-05, "loss": 0.3589, "step": 3174 }, { "epoch": 288.6787878787879, "grad_norm": 0.33504432439804077, "learning_rate": 1.825e-05, "loss": 0.3152, "step": 3175 }, { "epoch": 288.77575757575755, "grad_norm": 0.29916203022003174, "learning_rate": 1.824e-05, "loss": 0.3649, "step": 3176 }, { "epoch": 288.8727272727273, "grad_norm": 0.4544450342655182, "learning_rate": 1.823e-05, "loss": 0.3586, "step": 3177 }, { "epoch": 288.969696969697, "grad_norm": 0.3073657155036926, "learning_rate": 1.8220000000000002e-05, "loss": 0.3652, "step": 3178 }, { "epoch": 289.0, "grad_norm": 0.4764782190322876, "learning_rate": 1.8210000000000004e-05, "loss": 0.3451, "step": 3179 }, { "epoch": 289.0969696969697, "grad_norm": 0.2884770929813385, "learning_rate": 1.8200000000000002e-05, "loss": 0.3436, "step": 3180 }, { "epoch": 289.0969696969697, "eval_loss": 0.429638534784317, "eval_runtime": 2.102, "eval_samples_per_second": 26.165, "eval_steps_per_second": 3.33, "step": 3180 }, { "epoch": 289.1939393939394, "grad_norm": 0.2942644953727722, "learning_rate": 1.819e-05, "loss": 0.3399, "step": 3181 }, { "epoch": 289.2909090909091, "grad_norm": 0.30701959133148193, "learning_rate": 1.818e-05, "loss": 0.3338, "step": 3182 }, { "epoch": 289.3878787878788, "grad_norm": 0.3285771608352661, "learning_rate": 1.817e-05, "loss": 0.3458, "step": 3183 }, { "epoch": 289.4848484848485, "grad_norm": 0.27240973711013794, "learning_rate": 1.8160000000000002e-05, "loss": 0.3225, "step": 3184 }, { "epoch": 289.58181818181816, "grad_norm": 0.3030914068222046, "learning_rate": 1.815e-05, "loss": 0.3538, "step": 3185 }, { "epoch": 289.6787878787879, "grad_norm": 0.2722974717617035, "learning_rate": 1.8140000000000003e-05, "loss": 0.3198, "step": 3186 }, { "epoch": 289.77575757575755, "grad_norm": 0.3061240017414093, "learning_rate": 1.813e-05, "loss": 0.366, "step": 3187 }, { "epoch": 289.8727272727273, "grad_norm": 0.30942073464393616, "learning_rate": 1.812e-05, "loss": 0.3788, "step": 3188 }, { "epoch": 289.969696969697, "grad_norm": 0.3707157373428345, "learning_rate": 1.811e-05, "loss": 0.3517, "step": 3189 }, { "epoch": 290.0, "grad_norm": 0.4878753423690796, "learning_rate": 1.81e-05, "loss": 0.3301, "step": 3190 }, { "epoch": 290.0, "eval_loss": 0.4299190938472748, "eval_runtime": 2.1284, "eval_samples_per_second": 25.841, "eval_steps_per_second": 3.289, "step": 3190 }, { "epoch": 290.0969696969697, "grad_norm": 0.29387012124061584, "learning_rate": 1.809e-05, "loss": 0.3544, "step": 3191 }, { "epoch": 290.1939393939394, "grad_norm": 0.29010263085365295, "learning_rate": 1.808e-05, "loss": 0.383, "step": 3192 }, { "epoch": 290.2909090909091, "grad_norm": 0.27737659215927124, "learning_rate": 1.807e-05, "loss": 0.3158, "step": 3193 }, { "epoch": 290.3878787878788, "grad_norm": 0.27322596311569214, "learning_rate": 1.8060000000000003e-05, "loss": 0.3413, "step": 3194 }, { "epoch": 290.4848484848485, "grad_norm": 0.2959296703338623, "learning_rate": 1.805e-05, "loss": 0.3251, "step": 3195 }, { "epoch": 290.58181818181816, "grad_norm": 0.33172884583473206, "learning_rate": 1.804e-05, "loss": 0.3314, "step": 3196 }, { "epoch": 290.6787878787879, "grad_norm": 0.29233667254447937, "learning_rate": 1.803e-05, "loss": 0.3403, "step": 3197 }, { "epoch": 290.77575757575755, "grad_norm": 0.2946917414665222, "learning_rate": 1.802e-05, "loss": 0.3321, "step": 3198 }, { "epoch": 290.8727272727273, "grad_norm": 0.31235837936401367, "learning_rate": 1.8010000000000002e-05, "loss": 0.3635, "step": 3199 }, { "epoch": 290.969696969697, "grad_norm": 0.29859185218811035, "learning_rate": 1.8e-05, "loss": 0.3764, "step": 3200 }, { "epoch": 290.969696969697, "eval_loss": 0.42965859174728394, "eval_runtime": 2.1181, "eval_samples_per_second": 25.967, "eval_steps_per_second": 3.305, "step": 3200 }, { "epoch": 291.0, "grad_norm": 0.7010234594345093, "learning_rate": 1.7990000000000002e-05, "loss": 0.3011, "step": 3201 }, { "epoch": 291.0969696969697, "grad_norm": 0.27527788281440735, "learning_rate": 1.798e-05, "loss": 0.3411, "step": 3202 }, { "epoch": 291.1939393939394, "grad_norm": 0.2753966748714447, "learning_rate": 1.797e-05, "loss": 0.313, "step": 3203 }, { "epoch": 291.2909090909091, "grad_norm": 0.2650189995765686, "learning_rate": 1.796e-05, "loss": 0.322, "step": 3204 }, { "epoch": 291.3878787878788, "grad_norm": 0.2802697718143463, "learning_rate": 1.795e-05, "loss": 0.3403, "step": 3205 }, { "epoch": 291.4848484848485, "grad_norm": 0.33286914229393005, "learning_rate": 1.794e-05, "loss": 0.3379, "step": 3206 }, { "epoch": 291.58181818181816, "grad_norm": 0.30533063411712646, "learning_rate": 1.793e-05, "loss": 0.3336, "step": 3207 }, { "epoch": 291.6787878787879, "grad_norm": 0.3137774169445038, "learning_rate": 1.792e-05, "loss": 0.3487, "step": 3208 }, { "epoch": 291.77575757575755, "grad_norm": 0.37040799856185913, "learning_rate": 1.7910000000000003e-05, "loss": 0.3926, "step": 3209 }, { "epoch": 291.8727272727273, "grad_norm": 0.2896455228328705, "learning_rate": 1.79e-05, "loss": 0.3406, "step": 3210 }, { "epoch": 291.8727272727273, "eval_loss": 0.4292603135108948, "eval_runtime": 2.0907, "eval_samples_per_second": 26.307, "eval_steps_per_second": 3.348, "step": 3210 }, { "epoch": 291.969696969697, "grad_norm": 0.2867777347564697, "learning_rate": 1.789e-05, "loss": 0.3498, "step": 3211 }, { "epoch": 292.0, "grad_norm": 0.4879944324493408, "learning_rate": 1.7879999999999998e-05, "loss": 0.4218, "step": 3212 }, { "epoch": 292.0969696969697, "grad_norm": 0.29318252205848694, "learning_rate": 1.787e-05, "loss": 0.362, "step": 3213 }, { "epoch": 292.1939393939394, "grad_norm": 0.282903790473938, "learning_rate": 1.7860000000000002e-05, "loss": 0.332, "step": 3214 }, { "epoch": 292.2909090909091, "grad_norm": 0.31095901131629944, "learning_rate": 1.785e-05, "loss": 0.3615, "step": 3215 }, { "epoch": 292.3878787878788, "grad_norm": 0.30026254057884216, "learning_rate": 1.7840000000000002e-05, "loss": 0.3512, "step": 3216 }, { "epoch": 292.4848484848485, "grad_norm": 0.2917325496673584, "learning_rate": 1.783e-05, "loss": 0.3206, "step": 3217 }, { "epoch": 292.58181818181816, "grad_norm": 0.29843437671661377, "learning_rate": 1.7820000000000002e-05, "loss": 0.3621, "step": 3218 }, { "epoch": 292.6787878787879, "grad_norm": 0.2887900173664093, "learning_rate": 1.781e-05, "loss": 0.3549, "step": 3219 }, { "epoch": 292.77575757575755, "grad_norm": 0.3294726610183716, "learning_rate": 1.78e-05, "loss": 0.3464, "step": 3220 }, { "epoch": 292.77575757575755, "eval_loss": 0.42934826016426086, "eval_runtime": 2.1632, "eval_samples_per_second": 25.425, "eval_steps_per_second": 3.236, "step": 3220 }, { "epoch": 292.8727272727273, "grad_norm": 0.3146337866783142, "learning_rate": 1.779e-05, "loss": 0.3267, "step": 3221 }, { "epoch": 292.969696969697, "grad_norm": 0.2778744399547577, "learning_rate": 1.7780000000000003e-05, "loss": 0.3511, "step": 3222 }, { "epoch": 293.0, "grad_norm": 0.4883699417114258, "learning_rate": 1.777e-05, "loss": 0.2654, "step": 3223 }, { "epoch": 293.0969696969697, "grad_norm": 0.30644264817237854, "learning_rate": 1.7760000000000003e-05, "loss": 0.3088, "step": 3224 }, { "epoch": 293.1939393939394, "grad_norm": 0.29900655150413513, "learning_rate": 1.775e-05, "loss": 0.3547, "step": 3225 }, { "epoch": 293.2909090909091, "grad_norm": 0.3363381624221802, "learning_rate": 1.774e-05, "loss": 0.3218, "step": 3226 }, { "epoch": 293.3878787878788, "grad_norm": 0.2873002588748932, "learning_rate": 1.773e-05, "loss": 0.3535, "step": 3227 }, { "epoch": 293.4848484848485, "grad_norm": 0.31082361936569214, "learning_rate": 1.772e-05, "loss": 0.3491, "step": 3228 }, { "epoch": 293.58181818181816, "grad_norm": 0.3087656795978546, "learning_rate": 1.771e-05, "loss": 0.3448, "step": 3229 }, { "epoch": 293.6787878787879, "grad_norm": 0.2696332037448883, "learning_rate": 1.77e-05, "loss": 0.3501, "step": 3230 }, { "epoch": 293.6787878787879, "eval_loss": 0.42914244532585144, "eval_runtime": 2.1003, "eval_samples_per_second": 26.186, "eval_steps_per_second": 3.333, "step": 3230 }, { "epoch": 293.77575757575755, "grad_norm": 0.3170471787452698, "learning_rate": 1.7690000000000002e-05, "loss": 0.3262, "step": 3231 }, { "epoch": 293.8727272727273, "grad_norm": 0.2885948121547699, "learning_rate": 1.7680000000000004e-05, "loss": 0.4026, "step": 3232 }, { "epoch": 293.969696969697, "grad_norm": 0.4239996075630188, "learning_rate": 1.7670000000000002e-05, "loss": 0.3271, "step": 3233 }, { "epoch": 294.0, "grad_norm": 0.5607514977455139, "learning_rate": 1.766e-05, "loss": 0.3531, "step": 3234 }, { "epoch": 294.0969696969697, "grad_norm": 0.24723725020885468, "learning_rate": 1.765e-05, "loss": 0.3625, "step": 3235 }, { "epoch": 294.1939393939394, "grad_norm": 0.26862984895706177, "learning_rate": 1.764e-05, "loss": 0.3426, "step": 3236 }, { "epoch": 294.2909090909091, "grad_norm": 0.28407901525497437, "learning_rate": 1.7630000000000002e-05, "loss": 0.3475, "step": 3237 }, { "epoch": 294.3878787878788, "grad_norm": 0.29130661487579346, "learning_rate": 1.762e-05, "loss": 0.3089, "step": 3238 }, { "epoch": 294.4848484848485, "grad_norm": 0.3147640824317932, "learning_rate": 1.7610000000000002e-05, "loss": 0.3193, "step": 3239 }, { "epoch": 294.58181818181816, "grad_norm": 0.3013257384300232, "learning_rate": 1.76e-05, "loss": 0.3466, "step": 3240 }, { "epoch": 294.58181818181816, "eval_loss": 0.4294959604740143, "eval_runtime": 2.1214, "eval_samples_per_second": 25.927, "eval_steps_per_second": 3.3, "step": 3240 }, { "epoch": 294.6787878787879, "grad_norm": 0.36485326290130615, "learning_rate": 1.759e-05, "loss": 0.3797, "step": 3241 }, { "epoch": 294.77575757575755, "grad_norm": 0.37424951791763306, "learning_rate": 1.758e-05, "loss": 0.3067, "step": 3242 }, { "epoch": 294.8727272727273, "grad_norm": 0.29747769236564636, "learning_rate": 1.757e-05, "loss": 0.3655, "step": 3243 }, { "epoch": 294.969696969697, "grad_norm": 0.3009466230869293, "learning_rate": 1.756e-05, "loss": 0.3634, "step": 3244 }, { "epoch": 295.0, "grad_norm": 0.6070283651351929, "learning_rate": 1.755e-05, "loss": 0.3316, "step": 3245 }, { "epoch": 295.0969696969697, "grad_norm": 0.27728110551834106, "learning_rate": 1.754e-05, "loss": 0.3468, "step": 3246 }, { "epoch": 295.1939393939394, "grad_norm": 0.3359808027744293, "learning_rate": 1.7530000000000003e-05, "loss": 0.363, "step": 3247 }, { "epoch": 295.2909090909091, "grad_norm": 0.25140830874443054, "learning_rate": 1.752e-05, "loss": 0.3309, "step": 3248 }, { "epoch": 295.3878787878788, "grad_norm": 0.33611467480659485, "learning_rate": 1.751e-05, "loss": 0.3477, "step": 3249 }, { "epoch": 295.4848484848485, "grad_norm": 0.2783098816871643, "learning_rate": 1.75e-05, "loss": 0.3626, "step": 3250 }, { "epoch": 295.4848484848485, "eval_loss": 0.4290139079093933, "eval_runtime": 2.1355, "eval_samples_per_second": 25.755, "eval_steps_per_second": 3.278, "step": 3250 }, { "epoch": 295.58181818181816, "grad_norm": 0.295444130897522, "learning_rate": 1.749e-05, "loss": 0.3462, "step": 3251 }, { "epoch": 295.6787878787879, "grad_norm": 0.2505262494087219, "learning_rate": 1.7480000000000002e-05, "loss": 0.3365, "step": 3252 }, { "epoch": 295.77575757575755, "grad_norm": 0.2934752106666565, "learning_rate": 1.747e-05, "loss": 0.3433, "step": 3253 }, { "epoch": 295.8727272727273, "grad_norm": 0.31944575905799866, "learning_rate": 1.7460000000000002e-05, "loss": 0.3482, "step": 3254 }, { "epoch": 295.969696969697, "grad_norm": 0.2714713215827942, "learning_rate": 1.745e-05, "loss": 0.3014, "step": 3255 }, { "epoch": 296.0, "grad_norm": 0.543958842754364, "learning_rate": 1.7440000000000002e-05, "loss": 0.3689, "step": 3256 }, { "epoch": 296.0969696969697, "grad_norm": 0.30188027024269104, "learning_rate": 1.743e-05, "loss": 0.3403, "step": 3257 }, { "epoch": 296.1939393939394, "grad_norm": 0.3010038733482361, "learning_rate": 1.742e-05, "loss": 0.3497, "step": 3258 }, { "epoch": 296.2909090909091, "grad_norm": 0.29702892899513245, "learning_rate": 1.741e-05, "loss": 0.3126, "step": 3259 }, { "epoch": 296.3878787878788, "grad_norm": 0.31002187728881836, "learning_rate": 1.74e-05, "loss": 0.3697, "step": 3260 }, { "epoch": 296.3878787878788, "eval_loss": 0.42891135811805725, "eval_runtime": 2.1018, "eval_samples_per_second": 26.168, "eval_steps_per_second": 3.33, "step": 3260 }, { "epoch": 296.4848484848485, "grad_norm": 0.2677341401576996, "learning_rate": 1.739e-05, "loss": 0.3266, "step": 3261 }, { "epoch": 296.58181818181816, "grad_norm": 0.3090519607067108, "learning_rate": 1.7380000000000003e-05, "loss": 0.3657, "step": 3262 }, { "epoch": 296.6787878787879, "grad_norm": 0.2831075191497803, "learning_rate": 1.737e-05, "loss": 0.3482, "step": 3263 }, { "epoch": 296.77575757575755, "grad_norm": 0.3414286673069, "learning_rate": 1.736e-05, "loss": 0.3999, "step": 3264 }, { "epoch": 296.8727272727273, "grad_norm": 0.3059631884098053, "learning_rate": 1.7349999999999998e-05, "loss": 0.327, "step": 3265 }, { "epoch": 296.969696969697, "grad_norm": 0.2777879536151886, "learning_rate": 1.734e-05, "loss": 0.325, "step": 3266 }, { "epoch": 297.0, "grad_norm": 0.5726981163024902, "learning_rate": 1.7330000000000002e-05, "loss": 0.2346, "step": 3267 }, { "epoch": 297.0969696969697, "grad_norm": 0.251736581325531, "learning_rate": 1.732e-05, "loss": 0.3528, "step": 3268 }, { "epoch": 297.1939393939394, "grad_norm": 0.3111477792263031, "learning_rate": 1.7310000000000002e-05, "loss": 0.3494, "step": 3269 }, { "epoch": 297.2909090909091, "grad_norm": 0.26640409231185913, "learning_rate": 1.73e-05, "loss": 0.3449, "step": 3270 }, { "epoch": 297.2909090909091, "eval_loss": 0.4294259548187256, "eval_runtime": 2.139, "eval_samples_per_second": 25.713, "eval_steps_per_second": 3.273, "step": 3270 }, { "epoch": 297.3878787878788, "grad_norm": 0.28788143396377563, "learning_rate": 1.7290000000000002e-05, "loss": 0.3584, "step": 3271 }, { "epoch": 297.4848484848485, "grad_norm": 0.3301577866077423, "learning_rate": 1.728e-05, "loss": 0.3715, "step": 3272 }, { "epoch": 297.58181818181816, "grad_norm": 0.3239557147026062, "learning_rate": 1.727e-05, "loss": 0.326, "step": 3273 }, { "epoch": 297.6787878787879, "grad_norm": 0.260246217250824, "learning_rate": 1.726e-05, "loss": 0.3456, "step": 3274 }, { "epoch": 297.77575757575755, "grad_norm": 0.3090617060661316, "learning_rate": 1.725e-05, "loss": 0.3487, "step": 3275 }, { "epoch": 297.8727272727273, "grad_norm": 0.2856177091598511, "learning_rate": 1.724e-05, "loss": 0.2956, "step": 3276 }, { "epoch": 297.969696969697, "grad_norm": 0.31400689482688904, "learning_rate": 1.7230000000000003e-05, "loss": 0.341, "step": 3277 }, { "epoch": 298.0, "grad_norm": 0.5386035442352295, "learning_rate": 1.722e-05, "loss": 0.3256, "step": 3278 }, { "epoch": 298.0969696969697, "grad_norm": 0.31615230441093445, "learning_rate": 1.721e-05, "loss": 0.3486, "step": 3279 }, { "epoch": 298.1939393939394, "grad_norm": 0.3985329568386078, "learning_rate": 1.7199999999999998e-05, "loss": 0.3398, "step": 3280 }, { "epoch": 298.1939393939394, "eval_loss": 0.4285666346549988, "eval_runtime": 2.1236, "eval_samples_per_second": 25.899, "eval_steps_per_second": 3.296, "step": 3280 }, { "epoch": 298.2909090909091, "grad_norm": 0.24339041113853455, "learning_rate": 1.719e-05, "loss": 0.3374, "step": 3281 }, { "epoch": 298.3878787878788, "grad_norm": 0.27008259296417236, "learning_rate": 1.718e-05, "loss": 0.3551, "step": 3282 }, { "epoch": 298.4848484848485, "grad_norm": 0.2825574576854706, "learning_rate": 1.717e-05, "loss": 0.3557, "step": 3283 }, { "epoch": 298.58181818181816, "grad_norm": 0.3063613772392273, "learning_rate": 1.7160000000000002e-05, "loss": 0.3248, "step": 3284 }, { "epoch": 298.6787878787879, "grad_norm": 0.3211213946342468, "learning_rate": 1.7150000000000004e-05, "loss": 0.3411, "step": 3285 }, { "epoch": 298.77575757575755, "grad_norm": 0.2771667540073395, "learning_rate": 1.7140000000000002e-05, "loss": 0.335, "step": 3286 }, { "epoch": 298.8727272727273, "grad_norm": 0.3101070821285248, "learning_rate": 1.713e-05, "loss": 0.3573, "step": 3287 }, { "epoch": 298.969696969697, "grad_norm": 0.2912956178188324, "learning_rate": 1.712e-05, "loss": 0.3304, "step": 3288 }, { "epoch": 299.0, "grad_norm": 0.5872663855552673, "learning_rate": 1.711e-05, "loss": 0.3486, "step": 3289 }, { "epoch": 299.0969696969697, "grad_norm": 0.30190402269363403, "learning_rate": 1.7100000000000002e-05, "loss": 0.3315, "step": 3290 }, { "epoch": 299.0969696969697, "eval_loss": 0.42890438437461853, "eval_runtime": 2.1137, "eval_samples_per_second": 26.02, "eval_steps_per_second": 3.312, "step": 3290 }, { "epoch": 299.1939393939394, "grad_norm": 0.3438356816768646, "learning_rate": 1.709e-05, "loss": 0.3138, "step": 3291 }, { "epoch": 299.2909090909091, "grad_norm": 0.2754231095314026, "learning_rate": 1.7080000000000002e-05, "loss": 0.3508, "step": 3292 }, { "epoch": 299.3878787878788, "grad_norm": 0.27830561995506287, "learning_rate": 1.707e-05, "loss": 0.3243, "step": 3293 }, { "epoch": 299.4848484848485, "grad_norm": 0.3274482488632202, "learning_rate": 1.706e-05, "loss": 0.3398, "step": 3294 }, { "epoch": 299.58181818181816, "grad_norm": 0.34774988889694214, "learning_rate": 1.705e-05, "loss": 0.3373, "step": 3295 }, { "epoch": 299.6787878787879, "grad_norm": 0.3125561773777008, "learning_rate": 1.704e-05, "loss": 0.3228, "step": 3296 }, { "epoch": 299.77575757575755, "grad_norm": 0.2622632086277008, "learning_rate": 1.703e-05, "loss": 0.3725, "step": 3297 }, { "epoch": 299.8727272727273, "grad_norm": 0.2701694965362549, "learning_rate": 1.702e-05, "loss": 0.3695, "step": 3298 }, { "epoch": 299.969696969697, "grad_norm": 0.3075459897518158, "learning_rate": 1.701e-05, "loss": 0.354, "step": 3299 }, { "epoch": 300.0, "grad_norm": 0.5179641842842102, "learning_rate": 1.7000000000000003e-05, "loss": 0.3704, "step": 3300 }, { "epoch": 300.0, "eval_loss": 0.4286232888698578, "eval_runtime": 2.1146, "eval_samples_per_second": 26.01, "eval_steps_per_second": 3.31, "step": 3300 }, { "epoch": 300.0969696969697, "grad_norm": 0.2588394284248352, "learning_rate": 1.699e-05, "loss": 0.3211, "step": 3301 }, { "epoch": 300.1939393939394, "grad_norm": 0.2719418704509735, "learning_rate": 1.698e-05, "loss": 0.3314, "step": 3302 }, { "epoch": 300.2909090909091, "grad_norm": 0.2840977907180786, "learning_rate": 1.697e-05, "loss": 0.3335, "step": 3303 }, { "epoch": 300.3878787878788, "grad_norm": 0.3237716257572174, "learning_rate": 1.696e-05, "loss": 0.3296, "step": 3304 }, { "epoch": 300.4848484848485, "grad_norm": 0.3320935368537903, "learning_rate": 1.6950000000000002e-05, "loss": 0.3671, "step": 3305 }, { "epoch": 300.58181818181816, "grad_norm": 0.5175302028656006, "learning_rate": 1.694e-05, "loss": 0.3522, "step": 3306 }, { "epoch": 300.6787878787879, "grad_norm": 0.3399061858654022, "learning_rate": 1.6930000000000002e-05, "loss": 0.3604, "step": 3307 }, { "epoch": 300.77575757575755, "grad_norm": 0.2818537950515747, "learning_rate": 1.692e-05, "loss": 0.3422, "step": 3308 }, { "epoch": 300.8727272727273, "grad_norm": 0.3524338901042938, "learning_rate": 1.6910000000000002e-05, "loss": 0.3142, "step": 3309 }, { "epoch": 300.969696969697, "grad_norm": 0.284162312746048, "learning_rate": 1.69e-05, "loss": 0.3543, "step": 3310 }, { "epoch": 300.969696969697, "eval_loss": 0.42852863669395447, "eval_runtime": 2.1374, "eval_samples_per_second": 25.732, "eval_steps_per_second": 3.275, "step": 3310 }, { "epoch": 301.0, "grad_norm": 0.6364848613739014, "learning_rate": 1.689e-05, "loss": 0.4003, "step": 3311 }, { "epoch": 301.0969696969697, "grad_norm": 0.45382216572761536, "learning_rate": 1.688e-05, "loss": 0.3158, "step": 3312 }, { "epoch": 301.1939393939394, "grad_norm": 0.3232753872871399, "learning_rate": 1.687e-05, "loss": 0.3746, "step": 3313 }, { "epoch": 301.2909090909091, "grad_norm": 0.3339671194553375, "learning_rate": 1.686e-05, "loss": 0.3444, "step": 3314 }, { "epoch": 301.3878787878788, "grad_norm": 0.33689528703689575, "learning_rate": 1.6850000000000003e-05, "loss": 0.3493, "step": 3315 }, { "epoch": 301.4848484848485, "grad_norm": 0.32228925824165344, "learning_rate": 1.684e-05, "loss": 0.344, "step": 3316 }, { "epoch": 301.58181818181816, "grad_norm": 0.2638039290904999, "learning_rate": 1.683e-05, "loss": 0.3418, "step": 3317 }, { "epoch": 301.6787878787879, "grad_norm": 0.3011254072189331, "learning_rate": 1.6819999999999998e-05, "loss": 0.3614, "step": 3318 }, { "epoch": 301.77575757575755, "grad_norm": 0.28206703066825867, "learning_rate": 1.681e-05, "loss": 0.3241, "step": 3319 }, { "epoch": 301.8727272727273, "grad_norm": 0.2938118875026703, "learning_rate": 1.6800000000000002e-05, "loss": 0.3496, "step": 3320 }, { "epoch": 301.8727272727273, "eval_loss": 0.4284074008464813, "eval_runtime": 2.1364, "eval_samples_per_second": 25.744, "eval_steps_per_second": 3.277, "step": 3320 }, { "epoch": 301.969696969697, "grad_norm": 0.28451794385910034, "learning_rate": 1.679e-05, "loss": 0.3157, "step": 3321 }, { "epoch": 302.0, "grad_norm": 0.41703781485557556, "learning_rate": 1.6780000000000002e-05, "loss": 0.3465, "step": 3322 }, { "epoch": 302.0969696969697, "grad_norm": 0.28743523359298706, "learning_rate": 1.677e-05, "loss": 0.3253, "step": 3323 }, { "epoch": 302.1939393939394, "grad_norm": 0.26439985632896423, "learning_rate": 1.6760000000000002e-05, "loss": 0.3537, "step": 3324 }, { "epoch": 302.2909090909091, "grad_norm": 0.28935468196868896, "learning_rate": 1.675e-05, "loss": 0.3165, "step": 3325 }, { "epoch": 302.3878787878788, "grad_norm": 0.2709727883338928, "learning_rate": 1.674e-05, "loss": 0.3501, "step": 3326 }, { "epoch": 302.4848484848485, "grad_norm": 0.3182224631309509, "learning_rate": 1.673e-05, "loss": 0.3083, "step": 3327 }, { "epoch": 302.58181818181816, "grad_norm": 0.3185989558696747, "learning_rate": 1.672e-05, "loss": 0.3492, "step": 3328 }, { "epoch": 302.6787878787879, "grad_norm": 0.2719970643520355, "learning_rate": 1.671e-05, "loss": 0.353, "step": 3329 }, { "epoch": 302.77575757575755, "grad_norm": 0.2939992845058441, "learning_rate": 1.6700000000000003e-05, "loss": 0.342, "step": 3330 }, { "epoch": 302.77575757575755, "eval_loss": 0.4282497465610504, "eval_runtime": 2.1569, "eval_samples_per_second": 25.5, "eval_steps_per_second": 3.245, "step": 3330 }, { "epoch": 302.8727272727273, "grad_norm": 0.2789037823677063, "learning_rate": 1.669e-05, "loss": 0.3246, "step": 3331 }, { "epoch": 302.969696969697, "grad_norm": 0.32086724042892456, "learning_rate": 1.668e-05, "loss": 0.382, "step": 3332 }, { "epoch": 303.0, "grad_norm": 0.5400450229644775, "learning_rate": 1.6669999999999998e-05, "loss": 0.3965, "step": 3333 }, { "epoch": 303.0969696969697, "grad_norm": 0.28759241104125977, "learning_rate": 1.666e-05, "loss": 0.3688, "step": 3334 }, { "epoch": 303.1939393939394, "grad_norm": 0.26669740676879883, "learning_rate": 1.665e-05, "loss": 0.3256, "step": 3335 }, { "epoch": 303.2909090909091, "grad_norm": 0.29431137442588806, "learning_rate": 1.664e-05, "loss": 0.3388, "step": 3336 }, { "epoch": 303.3878787878788, "grad_norm": 0.27999913692474365, "learning_rate": 1.6630000000000002e-05, "loss": 0.352, "step": 3337 }, { "epoch": 303.4848484848485, "grad_norm": 0.28299450874328613, "learning_rate": 1.662e-05, "loss": 0.3548, "step": 3338 }, { "epoch": 303.58181818181816, "grad_norm": 0.3314816355705261, "learning_rate": 1.6610000000000002e-05, "loss": 0.3551, "step": 3339 }, { "epoch": 303.6787878787879, "grad_norm": 0.2577263414859772, "learning_rate": 1.66e-05, "loss": 0.3113, "step": 3340 }, { "epoch": 303.6787878787879, "eval_loss": 0.42867740988731384, "eval_runtime": 2.1184, "eval_samples_per_second": 25.964, "eval_steps_per_second": 3.304, "step": 3340 }, { "epoch": 303.77575757575755, "grad_norm": 0.27317485213279724, "learning_rate": 1.659e-05, "loss": 0.3556, "step": 3341 }, { "epoch": 303.8727272727273, "grad_norm": 0.3101962208747864, "learning_rate": 1.658e-05, "loss": 0.3076, "step": 3342 }, { "epoch": 303.969696969697, "grad_norm": 0.2733157277107239, "learning_rate": 1.657e-05, "loss": 0.3224, "step": 3343 }, { "epoch": 304.0, "grad_norm": 0.6002190113067627, "learning_rate": 1.656e-05, "loss": 0.4141, "step": 3344 }, { "epoch": 304.0969696969697, "grad_norm": 0.2699577808380127, "learning_rate": 1.6550000000000002e-05, "loss": 0.3271, "step": 3345 }, { "epoch": 304.1939393939394, "grad_norm": 0.26280707120895386, "learning_rate": 1.654e-05, "loss": 0.3375, "step": 3346 }, { "epoch": 304.2909090909091, "grad_norm": 0.28962603211402893, "learning_rate": 1.6530000000000003e-05, "loss": 0.3411, "step": 3347 }, { "epoch": 304.3878787878788, "grad_norm": 0.37396422028541565, "learning_rate": 1.652e-05, "loss": 0.3239, "step": 3348 }, { "epoch": 304.4848484848485, "grad_norm": 0.2670392096042633, "learning_rate": 1.651e-05, "loss": 0.3541, "step": 3349 }, { "epoch": 304.58181818181816, "grad_norm": 0.3426806628704071, "learning_rate": 1.65e-05, "loss": 0.3417, "step": 3350 }, { "epoch": 304.58181818181816, "eval_loss": 0.42846569418907166, "eval_runtime": 2.1309, "eval_samples_per_second": 25.811, "eval_steps_per_second": 3.285, "step": 3350 }, { "epoch": 304.6787878787879, "grad_norm": 0.3189926743507385, "learning_rate": 1.649e-05, "loss": 0.3505, "step": 3351 }, { "epoch": 304.77575757575755, "grad_norm": 0.29972100257873535, "learning_rate": 1.648e-05, "loss": 0.367, "step": 3352 }, { "epoch": 304.8727272727273, "grad_norm": 0.3046066164970398, "learning_rate": 1.6470000000000003e-05, "loss": 0.3114, "step": 3353 }, { "epoch": 304.969696969697, "grad_norm": 0.29538267850875854, "learning_rate": 1.646e-05, "loss": 0.3604, "step": 3354 }, { "epoch": 305.0, "grad_norm": 0.48789748549461365, "learning_rate": 1.645e-05, "loss": 0.3483, "step": 3355 }, { "epoch": 305.0969696969697, "grad_norm": 0.293074369430542, "learning_rate": 1.644e-05, "loss": 0.3444, "step": 3356 }, { "epoch": 305.1939393939394, "grad_norm": 0.3257480263710022, "learning_rate": 1.643e-05, "loss": 0.3487, "step": 3357 }, { "epoch": 305.2909090909091, "grad_norm": 0.29099413752555847, "learning_rate": 1.6420000000000002e-05, "loss": 0.3707, "step": 3358 }, { "epoch": 305.3878787878788, "grad_norm": 0.2569948434829712, "learning_rate": 1.641e-05, "loss": 0.3147, "step": 3359 }, { "epoch": 305.4848484848485, "grad_norm": 0.3167359232902527, "learning_rate": 1.6400000000000002e-05, "loss": 0.339, "step": 3360 }, { "epoch": 305.4848484848485, "eval_loss": 0.42803701758384705, "eval_runtime": 2.1427, "eval_samples_per_second": 25.668, "eval_steps_per_second": 3.267, "step": 3360 }, { "epoch": 305.58181818181816, "grad_norm": 0.2881450653076172, "learning_rate": 1.639e-05, "loss": 0.3366, "step": 3361 }, { "epoch": 305.6787878787879, "grad_norm": 0.22674117982387543, "learning_rate": 1.6380000000000002e-05, "loss": 0.3346, "step": 3362 }, { "epoch": 305.77575757575755, "grad_norm": 0.34561410546302795, "learning_rate": 1.637e-05, "loss": 0.3172, "step": 3363 }, { "epoch": 305.8727272727273, "grad_norm": 0.3139340281486511, "learning_rate": 1.636e-05, "loss": 0.3488, "step": 3364 }, { "epoch": 305.969696969697, "grad_norm": 0.28778886795043945, "learning_rate": 1.635e-05, "loss": 0.3472, "step": 3365 }, { "epoch": 306.0, "grad_norm": 0.599293053150177, "learning_rate": 1.634e-05, "loss": 0.3715, "step": 3366 }, { "epoch": 306.0969696969697, "grad_norm": 0.33993762731552124, "learning_rate": 1.633e-05, "loss": 0.3552, "step": 3367 }, { "epoch": 306.1939393939394, "grad_norm": 0.29261258244514465, "learning_rate": 1.6320000000000003e-05, "loss": 0.341, "step": 3368 }, { "epoch": 306.2909090909091, "grad_norm": 0.30039000511169434, "learning_rate": 1.631e-05, "loss": 0.334, "step": 3369 }, { "epoch": 306.3878787878788, "grad_norm": 0.3208199739456177, "learning_rate": 1.63e-05, "loss": 0.3273, "step": 3370 }, { "epoch": 306.3878787878788, "eval_loss": 0.4288559854030609, "eval_runtime": 2.1164, "eval_samples_per_second": 25.987, "eval_steps_per_second": 3.307, "step": 3370 }, { "epoch": 306.4848484848485, "grad_norm": 0.3807413578033447, "learning_rate": 1.6289999999999998e-05, "loss": 0.2988, "step": 3371 }, { "epoch": 306.58181818181816, "grad_norm": 0.3323957920074463, "learning_rate": 1.628e-05, "loss": 0.3371, "step": 3372 }, { "epoch": 306.6787878787879, "grad_norm": 0.27052778005599976, "learning_rate": 1.6270000000000002e-05, "loss": 0.3627, "step": 3373 }, { "epoch": 306.77575757575755, "grad_norm": 0.3271731734275818, "learning_rate": 1.626e-05, "loss": 0.3724, "step": 3374 }, { "epoch": 306.8727272727273, "grad_norm": 0.26784583926200867, "learning_rate": 1.6250000000000002e-05, "loss": 0.3475, "step": 3375 }, { "epoch": 306.969696969697, "grad_norm": 0.3374277949333191, "learning_rate": 1.624e-05, "loss": 0.3392, "step": 3376 }, { "epoch": 307.0, "grad_norm": 0.4716193675994873, "learning_rate": 1.6230000000000002e-05, "loss": 0.3315, "step": 3377 }, { "epoch": 307.0969696969697, "grad_norm": 0.2882108986377716, "learning_rate": 1.622e-05, "loss": 0.3224, "step": 3378 }, { "epoch": 307.1939393939394, "grad_norm": 0.32774612307548523, "learning_rate": 1.621e-05, "loss": 0.3578, "step": 3379 }, { "epoch": 307.2909090909091, "grad_norm": 0.40872931480407715, "learning_rate": 1.62e-05, "loss": 0.347, "step": 3380 }, { "epoch": 307.2909090909091, "eval_loss": 0.4279690086841583, "eval_runtime": 2.1221, "eval_samples_per_second": 25.918, "eval_steps_per_second": 3.299, "step": 3380 }, { "epoch": 307.3878787878788, "grad_norm": 0.317185640335083, "learning_rate": 1.619e-05, "loss": 0.3236, "step": 3381 }, { "epoch": 307.4848484848485, "grad_norm": 0.3067472577095032, "learning_rate": 1.618e-05, "loss": 0.3414, "step": 3382 }, { "epoch": 307.58181818181816, "grad_norm": 0.2706355154514313, "learning_rate": 1.6170000000000003e-05, "loss": 0.3488, "step": 3383 }, { "epoch": 307.6787878787879, "grad_norm": 0.26995623111724854, "learning_rate": 1.616e-05, "loss": 0.3501, "step": 3384 }, { "epoch": 307.77575757575755, "grad_norm": 0.3374771177768707, "learning_rate": 1.6150000000000003e-05, "loss": 0.331, "step": 3385 }, { "epoch": 307.8727272727273, "grad_norm": 0.29605382680892944, "learning_rate": 1.6139999999999998e-05, "loss": 0.3445, "step": 3386 }, { "epoch": 307.969696969697, "grad_norm": 0.3035816550254822, "learning_rate": 1.613e-05, "loss": 0.3479, "step": 3387 }, { "epoch": 308.0, "grad_norm": 0.4600336253643036, "learning_rate": 1.612e-05, "loss": 0.3264, "step": 3388 }, { "epoch": 308.0969696969697, "grad_norm": 0.28409048914909363, "learning_rate": 1.611e-05, "loss": 0.3152, "step": 3389 }, { "epoch": 308.1939393939394, "grad_norm": 0.26470401883125305, "learning_rate": 1.6100000000000002e-05, "loss": 0.3355, "step": 3390 }, { "epoch": 308.1939393939394, "eval_loss": 0.42832431197166443, "eval_runtime": 2.1153, "eval_samples_per_second": 26.001, "eval_steps_per_second": 3.309, "step": 3390 }, { "epoch": 308.2909090909091, "grad_norm": 0.33754292130470276, "learning_rate": 1.609e-05, "loss": 0.3398, "step": 3391 }, { "epoch": 308.3878787878788, "grad_norm": 0.32827553153038025, "learning_rate": 1.6080000000000002e-05, "loss": 0.396, "step": 3392 }, { "epoch": 308.4848484848485, "grad_norm": 0.3123801052570343, "learning_rate": 1.607e-05, "loss": 0.3454, "step": 3393 }, { "epoch": 308.58181818181816, "grad_norm": 0.31118044257164, "learning_rate": 1.606e-05, "loss": 0.3468, "step": 3394 }, { "epoch": 308.6787878787879, "grad_norm": 0.32048311829566956, "learning_rate": 1.605e-05, "loss": 0.3689, "step": 3395 }, { "epoch": 308.77575757575755, "grad_norm": 0.3279634416103363, "learning_rate": 1.604e-05, "loss": 0.3171, "step": 3396 }, { "epoch": 308.8727272727273, "grad_norm": 0.2815092206001282, "learning_rate": 1.603e-05, "loss": 0.3118, "step": 3397 }, { "epoch": 308.969696969697, "grad_norm": 0.30864110589027405, "learning_rate": 1.6020000000000002e-05, "loss": 0.3363, "step": 3398 }, { "epoch": 309.0, "grad_norm": 0.55268794298172, "learning_rate": 1.601e-05, "loss": 0.3085, "step": 3399 }, { "epoch": 309.0969696969697, "grad_norm": 0.31190842390060425, "learning_rate": 1.6000000000000003e-05, "loss": 0.3552, "step": 3400 }, { "epoch": 309.0969696969697, "eval_loss": 0.4276943504810333, "eval_runtime": 2.1232, "eval_samples_per_second": 25.904, "eval_steps_per_second": 3.297, "step": 3400 }, { "epoch": 309.1939393939394, "grad_norm": 0.2999674081802368, "learning_rate": 1.599e-05, "loss": 0.3228, "step": 3401 }, { "epoch": 309.2909090909091, "grad_norm": 0.2836794853210449, "learning_rate": 1.598e-05, "loss": 0.3026, "step": 3402 }, { "epoch": 309.3878787878788, "grad_norm": 0.3843519687652588, "learning_rate": 1.597e-05, "loss": 0.3681, "step": 3403 }, { "epoch": 309.4848484848485, "grad_norm": 0.269823282957077, "learning_rate": 1.596e-05, "loss": 0.3443, "step": 3404 }, { "epoch": 309.58181818181816, "grad_norm": 0.291311115026474, "learning_rate": 1.595e-05, "loss": 0.3341, "step": 3405 }, { "epoch": 309.6787878787879, "grad_norm": 0.29707270860671997, "learning_rate": 1.594e-05, "loss": 0.3522, "step": 3406 }, { "epoch": 309.77575757575755, "grad_norm": 0.3027060925960541, "learning_rate": 1.593e-05, "loss": 0.3636, "step": 3407 }, { "epoch": 309.8727272727273, "grad_norm": 0.3122222423553467, "learning_rate": 1.592e-05, "loss": 0.3476, "step": 3408 }, { "epoch": 309.969696969697, "grad_norm": 0.270575612783432, "learning_rate": 1.591e-05, "loss": 0.3181, "step": 3409 }, { "epoch": 310.0, "grad_norm": 0.5845109224319458, "learning_rate": 1.59e-05, "loss": 0.3362, "step": 3410 }, { "epoch": 310.0, "eval_loss": 0.42802172899246216, "eval_runtime": 2.127, "eval_samples_per_second": 25.858, "eval_steps_per_second": 3.291, "step": 3410 }, { "epoch": 310.0969696969697, "grad_norm": 0.2749779522418976, "learning_rate": 1.5890000000000002e-05, "loss": 0.329, "step": 3411 }, { "epoch": 310.1939393939394, "grad_norm": 0.3250596821308136, "learning_rate": 1.588e-05, "loss": 0.3682, "step": 3412 }, { "epoch": 310.2909090909091, "grad_norm": 0.3222499489784241, "learning_rate": 1.5870000000000002e-05, "loss": 0.3753, "step": 3413 }, { "epoch": 310.3878787878788, "grad_norm": 0.3307608366012573, "learning_rate": 1.586e-05, "loss": 0.3522, "step": 3414 }, { "epoch": 310.4848484848485, "grad_norm": 0.2511182725429535, "learning_rate": 1.5850000000000002e-05, "loss": 0.34, "step": 3415 }, { "epoch": 310.58181818181816, "grad_norm": 0.2914133071899414, "learning_rate": 1.584e-05, "loss": 0.297, "step": 3416 }, { "epoch": 310.6787878787879, "grad_norm": 0.26581066846847534, "learning_rate": 1.583e-05, "loss": 0.3568, "step": 3417 }, { "epoch": 310.77575757575755, "grad_norm": 0.35770997405052185, "learning_rate": 1.582e-05, "loss": 0.3209, "step": 3418 }, { "epoch": 310.8727272727273, "grad_norm": 0.3471708595752716, "learning_rate": 1.581e-05, "loss": 0.3288, "step": 3419 }, { "epoch": 310.969696969697, "grad_norm": 0.3137297034263611, "learning_rate": 1.58e-05, "loss": 0.3496, "step": 3420 }, { "epoch": 310.969696969697, "eval_loss": 0.4279772639274597, "eval_runtime": 2.1187, "eval_samples_per_second": 25.959, "eval_steps_per_second": 3.304, "step": 3420 }, { "epoch": 311.0, "grad_norm": 0.5063175559043884, "learning_rate": 1.5790000000000003e-05, "loss": 0.2936, "step": 3421 }, { "epoch": 311.0969696969697, "grad_norm": 0.299641877412796, "learning_rate": 1.578e-05, "loss": 0.3563, "step": 3422 }, { "epoch": 311.1939393939394, "grad_norm": 0.2759750485420227, "learning_rate": 1.577e-05, "loss": 0.3226, "step": 3423 }, { "epoch": 311.2909090909091, "grad_norm": 0.34390154480934143, "learning_rate": 1.5759999999999998e-05, "loss": 0.3265, "step": 3424 }, { "epoch": 311.3878787878788, "grad_norm": 0.3145259618759155, "learning_rate": 1.575e-05, "loss": 0.344, "step": 3425 }, { "epoch": 311.4848484848485, "grad_norm": 0.3961293697357178, "learning_rate": 1.5740000000000002e-05, "loss": 0.3178, "step": 3426 }, { "epoch": 311.58181818181816, "grad_norm": 0.3132284879684448, "learning_rate": 1.573e-05, "loss": 0.3608, "step": 3427 }, { "epoch": 311.6787878787879, "grad_norm": 0.2846105992794037, "learning_rate": 1.5720000000000002e-05, "loss": 0.3414, "step": 3428 }, { "epoch": 311.77575757575755, "grad_norm": 0.3138156235218048, "learning_rate": 1.571e-05, "loss": 0.3688, "step": 3429 }, { "epoch": 311.8727272727273, "grad_norm": 0.3472890853881836, "learning_rate": 1.5700000000000002e-05, "loss": 0.3496, "step": 3430 }, { "epoch": 311.8727272727273, "eval_loss": 0.4282764494419098, "eval_runtime": 2.1266, "eval_samples_per_second": 25.863, "eval_steps_per_second": 3.292, "step": 3430 }, { "epoch": 311.969696969697, "grad_norm": 0.27399343252182007, "learning_rate": 1.569e-05, "loss": 0.3055, "step": 3431 }, { "epoch": 312.0, "grad_norm": 0.5205172896385193, "learning_rate": 1.568e-05, "loss": 0.3608, "step": 3432 }, { "epoch": 312.0969696969697, "grad_norm": 0.30829933285713196, "learning_rate": 1.567e-05, "loss": 0.306, "step": 3433 }, { "epoch": 312.1939393939394, "grad_norm": 0.3162461519241333, "learning_rate": 1.566e-05, "loss": 0.3345, "step": 3434 }, { "epoch": 312.2909090909091, "grad_norm": 0.28259918093681335, "learning_rate": 1.565e-05, "loss": 0.3672, "step": 3435 }, { "epoch": 312.3878787878788, "grad_norm": 0.28280651569366455, "learning_rate": 1.5640000000000003e-05, "loss": 0.3485, "step": 3436 }, { "epoch": 312.4848484848485, "grad_norm": 0.29870906472206116, "learning_rate": 1.563e-05, "loss": 0.3635, "step": 3437 }, { "epoch": 312.58181818181816, "grad_norm": 0.30681851506233215, "learning_rate": 1.5620000000000003e-05, "loss": 0.3335, "step": 3438 }, { "epoch": 312.6787878787879, "grad_norm": 0.29939502477645874, "learning_rate": 1.561e-05, "loss": 0.3349, "step": 3439 }, { "epoch": 312.77575757575755, "grad_norm": 0.3255018889904022, "learning_rate": 1.56e-05, "loss": 0.3386, "step": 3440 }, { "epoch": 312.77575757575755, "eval_loss": 0.4275350272655487, "eval_runtime": 2.1392, "eval_samples_per_second": 25.711, "eval_steps_per_second": 3.272, "step": 3440 }, { "epoch": 312.8727272727273, "grad_norm": 0.29633569717407227, "learning_rate": 1.559e-05, "loss": 0.3466, "step": 3441 }, { "epoch": 312.969696969697, "grad_norm": 0.26882505416870117, "learning_rate": 1.558e-05, "loss": 0.3449, "step": 3442 }, { "epoch": 313.0, "grad_norm": 0.43489527702331543, "learning_rate": 1.5570000000000002e-05, "loss": 0.2805, "step": 3443 }, { "epoch": 313.0969696969697, "grad_norm": 0.28184351325035095, "learning_rate": 1.556e-05, "loss": 0.3146, "step": 3444 }, { "epoch": 313.1939393939394, "grad_norm": 0.32311639189720154, "learning_rate": 1.5550000000000002e-05, "loss": 0.3836, "step": 3445 }, { "epoch": 313.2909090909091, "grad_norm": 0.3593761920928955, "learning_rate": 1.554e-05, "loss": 0.3481, "step": 3446 }, { "epoch": 313.3878787878788, "grad_norm": 0.3303178548812866, "learning_rate": 1.553e-05, "loss": 0.3512, "step": 3447 }, { "epoch": 313.4848484848485, "grad_norm": 0.2715561091899872, "learning_rate": 1.552e-05, "loss": 0.3606, "step": 3448 }, { "epoch": 313.58181818181816, "grad_norm": 0.31712865829467773, "learning_rate": 1.551e-05, "loss": 0.3525, "step": 3449 }, { "epoch": 313.6787878787879, "grad_norm": 0.30805760622024536, "learning_rate": 1.55e-05, "loss": 0.3276, "step": 3450 }, { "epoch": 313.6787878787879, "eval_loss": 0.4274803400039673, "eval_runtime": 2.1241, "eval_samples_per_second": 25.894, "eval_steps_per_second": 3.296, "step": 3450 }, { "epoch": 313.77575757575755, "grad_norm": 0.26966550946235657, "learning_rate": 1.5490000000000002e-05, "loss": 0.3137, "step": 3451 }, { "epoch": 313.8727272727273, "grad_norm": 0.2930833399295807, "learning_rate": 1.548e-05, "loss": 0.3352, "step": 3452 }, { "epoch": 313.969696969697, "grad_norm": 0.29453831911087036, "learning_rate": 1.5470000000000003e-05, "loss": 0.3087, "step": 3453 }, { "epoch": 314.0, "grad_norm": 0.4948039948940277, "learning_rate": 1.546e-05, "loss": 0.3371, "step": 3454 }, { "epoch": 314.0969696969697, "grad_norm": 0.29488882422447205, "learning_rate": 1.545e-05, "loss": 0.3489, "step": 3455 }, { "epoch": 314.1939393939394, "grad_norm": 0.2348889857530594, "learning_rate": 1.544e-05, "loss": 0.3255, "step": 3456 }, { "epoch": 314.2909090909091, "grad_norm": 0.3093610405921936, "learning_rate": 1.543e-05, "loss": 0.3239, "step": 3457 }, { "epoch": 314.3878787878788, "grad_norm": 0.30039307475090027, "learning_rate": 1.542e-05, "loss": 0.3257, "step": 3458 }, { "epoch": 314.4848484848485, "grad_norm": 0.3127565085887909, "learning_rate": 1.541e-05, "loss": 0.3782, "step": 3459 }, { "epoch": 314.58181818181816, "grad_norm": 0.2688782513141632, "learning_rate": 1.54e-05, "loss": 0.3159, "step": 3460 }, { "epoch": 314.58181818181816, "eval_loss": 0.4277699291706085, "eval_runtime": 2.1333, "eval_samples_per_second": 25.782, "eval_steps_per_second": 3.281, "step": 3460 }, { "epoch": 314.6787878787879, "grad_norm": 0.2904873192310333, "learning_rate": 1.539e-05, "loss": 0.3127, "step": 3461 }, { "epoch": 314.77575757575755, "grad_norm": 0.34919148683547974, "learning_rate": 1.538e-05, "loss": 0.365, "step": 3462 }, { "epoch": 314.8727272727273, "grad_norm": 0.3132627010345459, "learning_rate": 1.537e-05, "loss": 0.363, "step": 3463 }, { "epoch": 314.969696969697, "grad_norm": 0.30896368622779846, "learning_rate": 1.536e-05, "loss": 0.3534, "step": 3464 }, { "epoch": 315.0, "grad_norm": 0.4962367117404938, "learning_rate": 1.535e-05, "loss": 0.272, "step": 3465 }, { "epoch": 315.0969696969697, "grad_norm": 0.3301921784877777, "learning_rate": 1.5340000000000002e-05, "loss": 0.3477, "step": 3466 }, { "epoch": 315.1939393939394, "grad_norm": 0.26896604895591736, "learning_rate": 1.533e-05, "loss": 0.3477, "step": 3467 }, { "epoch": 315.2909090909091, "grad_norm": 0.28536197543144226, "learning_rate": 1.5320000000000002e-05, "loss": 0.3169, "step": 3468 }, { "epoch": 315.3878787878788, "grad_norm": 0.49175986647605896, "learning_rate": 1.531e-05, "loss": 0.3347, "step": 3469 }, { "epoch": 315.4848484848485, "grad_norm": 0.28688549995422363, "learning_rate": 1.53e-05, "loss": 0.3294, "step": 3470 }, { "epoch": 315.4848484848485, "eval_loss": 0.4271085262298584, "eval_runtime": 2.1397, "eval_samples_per_second": 25.705, "eval_steps_per_second": 3.272, "step": 3470 }, { "epoch": 315.58181818181816, "grad_norm": 0.3667631149291992, "learning_rate": 1.529e-05, "loss": 0.3546, "step": 3471 }, { "epoch": 315.6787878787879, "grad_norm": 0.3338446021080017, "learning_rate": 1.528e-05, "loss": 0.3329, "step": 3472 }, { "epoch": 315.77575757575755, "grad_norm": 0.3099835515022278, "learning_rate": 1.527e-05, "loss": 0.3548, "step": 3473 }, { "epoch": 315.8727272727273, "grad_norm": 0.25447139143943787, "learning_rate": 1.5260000000000003e-05, "loss": 0.3349, "step": 3474 }, { "epoch": 315.969696969697, "grad_norm": 0.3141191899776459, "learning_rate": 1.525e-05, "loss": 0.3448, "step": 3475 }, { "epoch": 316.0, "grad_norm": 0.58322674036026, "learning_rate": 1.5240000000000001e-05, "loss": 0.3171, "step": 3476 }, { "epoch": 316.0969696969697, "grad_norm": 0.26733261346817017, "learning_rate": 1.523e-05, "loss": 0.3089, "step": 3477 }, { "epoch": 316.1939393939394, "grad_norm": 0.28217995166778564, "learning_rate": 1.5220000000000002e-05, "loss": 0.3472, "step": 3478 }, { "epoch": 316.2909090909091, "grad_norm": 0.3269166946411133, "learning_rate": 1.5210000000000002e-05, "loss": 0.3838, "step": 3479 }, { "epoch": 316.3878787878788, "grad_norm": 0.37794098258018494, "learning_rate": 1.52e-05, "loss": 0.3263, "step": 3480 }, { "epoch": 316.3878787878788, "eval_loss": 0.42741304636001587, "eval_runtime": 2.1304, "eval_samples_per_second": 25.816, "eval_steps_per_second": 3.286, "step": 3480 }, { "epoch": 316.4848484848485, "grad_norm": 0.3018639087677002, "learning_rate": 1.5190000000000002e-05, "loss": 0.3679, "step": 3481 }, { "epoch": 316.58181818181816, "grad_norm": 0.2624432444572449, "learning_rate": 1.518e-05, "loss": 0.3288, "step": 3482 }, { "epoch": 316.6787878787879, "grad_norm": 0.2726985216140747, "learning_rate": 1.517e-05, "loss": 0.3474, "step": 3483 }, { "epoch": 316.77575757575755, "grad_norm": 0.25825533270835876, "learning_rate": 1.5160000000000002e-05, "loss": 0.3117, "step": 3484 }, { "epoch": 316.8727272727273, "grad_norm": 0.2738543450832367, "learning_rate": 1.515e-05, "loss": 0.3531, "step": 3485 }, { "epoch": 316.969696969697, "grad_norm": 0.30904558300971985, "learning_rate": 1.514e-05, "loss": 0.3376, "step": 3486 }, { "epoch": 317.0, "grad_norm": 0.49280738830566406, "learning_rate": 1.5129999999999999e-05, "loss": 0.2761, "step": 3487 }, { "epoch": 317.0969696969697, "grad_norm": 0.3254148066043854, "learning_rate": 1.5120000000000001e-05, "loss": 0.3588, "step": 3488 }, { "epoch": 317.1939393939394, "grad_norm": 0.2624494135379791, "learning_rate": 1.5110000000000003e-05, "loss": 0.3318, "step": 3489 }, { "epoch": 317.2909090909091, "grad_norm": 0.29126566648483276, "learning_rate": 1.51e-05, "loss": 0.3851, "step": 3490 }, { "epoch": 317.2909090909091, "eval_loss": 0.42778417468070984, "eval_runtime": 2.1286, "eval_samples_per_second": 25.838, "eval_steps_per_second": 3.288, "step": 3490 }, { "epoch": 317.3878787878788, "grad_norm": 0.2885343134403229, "learning_rate": 1.5090000000000001e-05, "loss": 0.3627, "step": 3491 }, { "epoch": 317.4848484848485, "grad_norm": 0.2681112289428711, "learning_rate": 1.508e-05, "loss": 0.3011, "step": 3492 }, { "epoch": 317.58181818181816, "grad_norm": 0.325305312871933, "learning_rate": 1.5070000000000001e-05, "loss": 0.3809, "step": 3493 }, { "epoch": 317.6787878787879, "grad_norm": 0.34081873297691345, "learning_rate": 1.5060000000000001e-05, "loss": 0.3354, "step": 3494 }, { "epoch": 317.77575757575755, "grad_norm": 0.2977830469608307, "learning_rate": 1.505e-05, "loss": 0.3032, "step": 3495 }, { "epoch": 317.8727272727273, "grad_norm": 0.2674584686756134, "learning_rate": 1.5040000000000002e-05, "loss": 0.3162, "step": 3496 }, { "epoch": 317.969696969697, "grad_norm": 0.4172142446041107, "learning_rate": 1.503e-05, "loss": 0.3079, "step": 3497 }, { "epoch": 318.0, "grad_norm": 0.47455283999443054, "learning_rate": 1.502e-05, "loss": 0.3574, "step": 3498 }, { "epoch": 318.0969696969697, "grad_norm": 0.29889288544654846, "learning_rate": 1.5010000000000002e-05, "loss": 0.3538, "step": 3499 }, { "epoch": 318.1939393939394, "grad_norm": 0.32577717304229736, "learning_rate": 1.5e-05, "loss": 0.288, "step": 3500 }, { "epoch": 318.1939393939394, "eval_loss": 0.42727982997894287, "eval_runtime": 2.1191, "eval_samples_per_second": 25.954, "eval_steps_per_second": 3.303, "step": 3500 }, { "epoch": 318.2909090909091, "grad_norm": 0.2910911440849304, "learning_rate": 1.499e-05, "loss": 0.3426, "step": 3501 }, { "epoch": 318.3878787878788, "grad_norm": 0.3130618929862976, "learning_rate": 1.4979999999999999e-05, "loss": 0.351, "step": 3502 }, { "epoch": 318.4848484848485, "grad_norm": 0.35861071944236755, "learning_rate": 1.497e-05, "loss": 0.3723, "step": 3503 }, { "epoch": 318.58181818181816, "grad_norm": 0.31288236379623413, "learning_rate": 1.4960000000000002e-05, "loss": 0.3183, "step": 3504 }, { "epoch": 318.6787878787879, "grad_norm": 0.2688522934913635, "learning_rate": 1.4950000000000001e-05, "loss": 0.3124, "step": 3505 }, { "epoch": 318.77575757575755, "grad_norm": 0.2826993465423584, "learning_rate": 1.4940000000000001e-05, "loss": 0.3453, "step": 3506 }, { "epoch": 318.8727272727273, "grad_norm": 0.3165665864944458, "learning_rate": 1.493e-05, "loss": 0.3775, "step": 3507 }, { "epoch": 318.969696969697, "grad_norm": 0.2961466610431671, "learning_rate": 1.4920000000000001e-05, "loss": 0.3331, "step": 3508 }, { "epoch": 319.0, "grad_norm": 0.5470519661903381, "learning_rate": 1.4910000000000001e-05, "loss": 0.3091, "step": 3509 }, { "epoch": 319.0969696969697, "grad_norm": 0.28951361775398254, "learning_rate": 1.49e-05, "loss": 0.3426, "step": 3510 }, { "epoch": 319.0969696969697, "eval_loss": 0.4273596704006195, "eval_runtime": 2.116, "eval_samples_per_second": 25.992, "eval_steps_per_second": 3.308, "step": 3510 }, { "epoch": 319.1939393939394, "grad_norm": 0.26769697666168213, "learning_rate": 1.4890000000000001e-05, "loss": 0.3528, "step": 3511 }, { "epoch": 319.2909090909091, "grad_norm": 0.30560117959976196, "learning_rate": 1.488e-05, "loss": 0.3703, "step": 3512 }, { "epoch": 319.3878787878788, "grad_norm": 0.26912692189216614, "learning_rate": 1.487e-05, "loss": 0.336, "step": 3513 }, { "epoch": 319.4848484848485, "grad_norm": 0.2910601794719696, "learning_rate": 1.4860000000000002e-05, "loss": 0.3301, "step": 3514 }, { "epoch": 319.58181818181816, "grad_norm": 0.29013335704803467, "learning_rate": 1.485e-05, "loss": 0.3387, "step": 3515 }, { "epoch": 319.6787878787879, "grad_norm": 0.279678612947464, "learning_rate": 1.4840000000000002e-05, "loss": 0.3256, "step": 3516 }, { "epoch": 319.77575757575755, "grad_norm": 0.2691352367401123, "learning_rate": 1.4829999999999999e-05, "loss": 0.3125, "step": 3517 }, { "epoch": 319.8727272727273, "grad_norm": 0.28071603178977966, "learning_rate": 1.482e-05, "loss": 0.3485, "step": 3518 }, { "epoch": 319.969696969697, "grad_norm": 0.32526275515556335, "learning_rate": 1.4810000000000002e-05, "loss": 0.3243, "step": 3519 }, { "epoch": 320.0, "grad_norm": 0.60272616147995, "learning_rate": 1.48e-05, "loss": 0.3513, "step": 3520 }, { "epoch": 320.0, "eval_loss": 0.42688173055648804, "eval_runtime": 2.129, "eval_samples_per_second": 25.833, "eval_steps_per_second": 3.288, "step": 3520 }, { "epoch": 320.0969696969697, "grad_norm": 0.32412081956863403, "learning_rate": 1.479e-05, "loss": 0.3895, "step": 3521 }, { "epoch": 320.1939393939394, "grad_norm": 0.2637699246406555, "learning_rate": 1.4779999999999999e-05, "loss": 0.3519, "step": 3522 }, { "epoch": 320.2909090909091, "grad_norm": 0.2972037196159363, "learning_rate": 1.4770000000000001e-05, "loss": 0.36, "step": 3523 }, { "epoch": 320.3878787878788, "grad_norm": 0.3079843819141388, "learning_rate": 1.4760000000000001e-05, "loss": 0.3174, "step": 3524 }, { "epoch": 320.4848484848485, "grad_norm": 0.3062552809715271, "learning_rate": 1.475e-05, "loss": 0.3088, "step": 3525 }, { "epoch": 320.58181818181816, "grad_norm": 0.2870745062828064, "learning_rate": 1.4740000000000001e-05, "loss": 0.2884, "step": 3526 }, { "epoch": 320.6787878787879, "grad_norm": 0.2897455096244812, "learning_rate": 1.473e-05, "loss": 0.3331, "step": 3527 }, { "epoch": 320.77575757575755, "grad_norm": 0.29129114747047424, "learning_rate": 1.472e-05, "loss": 0.3259, "step": 3528 }, { "epoch": 320.8727272727273, "grad_norm": 0.29049405455589294, "learning_rate": 1.4710000000000001e-05, "loss": 0.3441, "step": 3529 }, { "epoch": 320.969696969697, "grad_norm": 0.3123665750026703, "learning_rate": 1.47e-05, "loss": 0.3592, "step": 3530 }, { "epoch": 320.969696969697, "eval_loss": 0.42728039622306824, "eval_runtime": 2.1096, "eval_samples_per_second": 26.071, "eval_steps_per_second": 3.318, "step": 3530 }, { "epoch": 321.0, "grad_norm": 0.6494812965393066, "learning_rate": 1.4690000000000002e-05, "loss": 0.3496, "step": 3531 }, { "epoch": 321.0969696969697, "grad_norm": 0.24925000965595245, "learning_rate": 1.4680000000000002e-05, "loss": 0.3694, "step": 3532 }, { "epoch": 321.1939393939394, "grad_norm": 0.283581405878067, "learning_rate": 1.467e-05, "loss": 0.3035, "step": 3533 }, { "epoch": 321.2909090909091, "grad_norm": 0.2899045944213867, "learning_rate": 1.4660000000000002e-05, "loss": 0.3194, "step": 3534 }, { "epoch": 321.3878787878788, "grad_norm": 0.2756378948688507, "learning_rate": 1.465e-05, "loss": 0.3162, "step": 3535 }, { "epoch": 321.4848484848485, "grad_norm": 0.24909161031246185, "learning_rate": 1.464e-05, "loss": 0.3253, "step": 3536 }, { "epoch": 321.58181818181816, "grad_norm": 0.33241987228393555, "learning_rate": 1.4630000000000002e-05, "loss": 0.3408, "step": 3537 }, { "epoch": 321.6787878787879, "grad_norm": 0.33643579483032227, "learning_rate": 1.462e-05, "loss": 0.3442, "step": 3538 }, { "epoch": 321.77575757575755, "grad_norm": 0.24275629222393036, "learning_rate": 1.461e-05, "loss": 0.3182, "step": 3539 }, { "epoch": 321.8727272727273, "grad_norm": 0.3069804310798645, "learning_rate": 1.4599999999999999e-05, "loss": 0.3636, "step": 3540 }, { "epoch": 321.8727272727273, "eval_loss": 0.42699331045150757, "eval_runtime": 2.1078, "eval_samples_per_second": 26.093, "eval_steps_per_second": 3.321, "step": 3540 }, { "epoch": 321.969696969697, "grad_norm": 0.3260021209716797, "learning_rate": 1.4590000000000001e-05, "loss": 0.3606, "step": 3541 }, { "epoch": 322.0, "grad_norm": 0.5835338830947876, "learning_rate": 1.4580000000000003e-05, "loss": 0.3974, "step": 3542 }, { "epoch": 322.0969696969697, "grad_norm": 0.2785661220550537, "learning_rate": 1.4570000000000001e-05, "loss": 0.3258, "step": 3543 }, { "epoch": 322.1939393939394, "grad_norm": 0.2921468913555145, "learning_rate": 1.4560000000000001e-05, "loss": 0.3604, "step": 3544 }, { "epoch": 322.2909090909091, "grad_norm": 0.36530375480651855, "learning_rate": 1.455e-05, "loss": 0.3382, "step": 3545 }, { "epoch": 322.3878787878788, "grad_norm": 0.24066120386123657, "learning_rate": 1.4540000000000001e-05, "loss": 0.2964, "step": 3546 }, { "epoch": 322.4848484848485, "grad_norm": 0.30269855260849, "learning_rate": 1.4530000000000001e-05, "loss": 0.3187, "step": 3547 }, { "epoch": 322.58181818181816, "grad_norm": 0.3029041290283203, "learning_rate": 1.452e-05, "loss": 0.3732, "step": 3548 }, { "epoch": 322.6787878787879, "grad_norm": 0.2999378740787506, "learning_rate": 1.4510000000000002e-05, "loss": 0.3732, "step": 3549 }, { "epoch": 322.77575757575755, "grad_norm": 0.288894385099411, "learning_rate": 1.45e-05, "loss": 0.3201, "step": 3550 }, { "epoch": 322.77575757575755, "eval_loss": 0.4270142912864685, "eval_runtime": 2.1015, "eval_samples_per_second": 26.172, "eval_steps_per_second": 3.331, "step": 3550 }, { "epoch": 322.8727272727273, "grad_norm": 0.2889610230922699, "learning_rate": 1.449e-05, "loss": 0.3297, "step": 3551 }, { "epoch": 322.969696969697, "grad_norm": 0.3216736614704132, "learning_rate": 1.4480000000000002e-05, "loss": 0.3422, "step": 3552 }, { "epoch": 323.0, "grad_norm": 0.5026072263717651, "learning_rate": 1.447e-05, "loss": 0.3391, "step": 3553 }, { "epoch": 323.0969696969697, "grad_norm": 0.4211641252040863, "learning_rate": 1.4460000000000002e-05, "loss": 0.3586, "step": 3554 }, { "epoch": 323.1939393939394, "grad_norm": 0.30118682980537415, "learning_rate": 1.4449999999999999e-05, "loss": 0.332, "step": 3555 }, { "epoch": 323.2909090909091, "grad_norm": 0.25670328736305237, "learning_rate": 1.444e-05, "loss": 0.3582, "step": 3556 }, { "epoch": 323.3878787878788, "grad_norm": 0.3140673339366913, "learning_rate": 1.4430000000000002e-05, "loss": 0.3238, "step": 3557 }, { "epoch": 323.4848484848485, "grad_norm": 0.32594263553619385, "learning_rate": 1.4420000000000001e-05, "loss": 0.3079, "step": 3558 }, { "epoch": 323.58181818181816, "grad_norm": 0.3219808042049408, "learning_rate": 1.4410000000000001e-05, "loss": 0.3602, "step": 3559 }, { "epoch": 323.6787878787879, "grad_norm": 0.27611875534057617, "learning_rate": 1.44e-05, "loss": 0.3265, "step": 3560 }, { "epoch": 323.6787878787879, "eval_loss": 0.4271848499774933, "eval_runtime": 2.1293, "eval_samples_per_second": 25.831, "eval_steps_per_second": 3.288, "step": 3560 }, { "epoch": 323.77575757575755, "grad_norm": 0.29057878255844116, "learning_rate": 1.4390000000000001e-05, "loss": 0.3325, "step": 3561 }, { "epoch": 323.8727272727273, "grad_norm": 0.2801297605037689, "learning_rate": 1.4380000000000001e-05, "loss": 0.3796, "step": 3562 }, { "epoch": 323.969696969697, "grad_norm": 0.3203805983066559, "learning_rate": 1.437e-05, "loss": 0.3119, "step": 3563 }, { "epoch": 324.0, "grad_norm": 0.4999186396598816, "learning_rate": 1.4360000000000001e-05, "loss": 0.3017, "step": 3564 }, { "epoch": 324.0969696969697, "grad_norm": 0.2902086675167084, "learning_rate": 1.435e-05, "loss": 0.2954, "step": 3565 }, { "epoch": 324.1939393939394, "grad_norm": 0.26491281390190125, "learning_rate": 1.434e-05, "loss": 0.3254, "step": 3566 }, { "epoch": 324.2909090909091, "grad_norm": 0.32385510206222534, "learning_rate": 1.4330000000000002e-05, "loss": 0.3507, "step": 3567 }, { "epoch": 324.3878787878788, "grad_norm": 0.2817283570766449, "learning_rate": 1.432e-05, "loss": 0.3183, "step": 3568 }, { "epoch": 324.4848484848485, "grad_norm": 0.32849523425102234, "learning_rate": 1.4310000000000002e-05, "loss": 0.3303, "step": 3569 }, { "epoch": 324.58181818181816, "grad_norm": 0.30101409554481506, "learning_rate": 1.43e-05, "loss": 0.3005, "step": 3570 }, { "epoch": 324.58181818181816, "eval_loss": 0.4270551800727844, "eval_runtime": 2.166, "eval_samples_per_second": 25.393, "eval_steps_per_second": 3.232, "step": 3570 }, { "epoch": 324.6787878787879, "grad_norm": 0.2798803448677063, "learning_rate": 1.429e-05, "loss": 0.3218, "step": 3571 }, { "epoch": 324.77575757575755, "grad_norm": 0.30352485179901123, "learning_rate": 1.4280000000000002e-05, "loss": 0.3478, "step": 3572 }, { "epoch": 324.8727272727273, "grad_norm": 0.344475120306015, "learning_rate": 1.427e-05, "loss": 0.3741, "step": 3573 }, { "epoch": 324.969696969697, "grad_norm": 0.3088480234146118, "learning_rate": 1.426e-05, "loss": 0.3926, "step": 3574 }, { "epoch": 325.0, "grad_norm": 0.5807185173034668, "learning_rate": 1.4249999999999999e-05, "loss": 0.4022, "step": 3575 }, { "epoch": 325.0969696969697, "grad_norm": 0.3358258903026581, "learning_rate": 1.4240000000000001e-05, "loss": 0.2947, "step": 3576 }, { "epoch": 325.1939393939394, "grad_norm": 0.32850319147109985, "learning_rate": 1.4230000000000001e-05, "loss": 0.3275, "step": 3577 }, { "epoch": 325.2909090909091, "grad_norm": 0.33393165469169617, "learning_rate": 1.422e-05, "loss": 0.3291, "step": 3578 }, { "epoch": 325.3878787878788, "grad_norm": 0.2724887430667877, "learning_rate": 1.4210000000000001e-05, "loss": 0.3414, "step": 3579 }, { "epoch": 325.4848484848485, "grad_norm": 0.26683491468429565, "learning_rate": 1.42e-05, "loss": 0.3547, "step": 3580 }, { "epoch": 325.4848484848485, "eval_loss": 0.42706215381622314, "eval_runtime": 2.1441, "eval_samples_per_second": 25.652, "eval_steps_per_second": 3.265, "step": 3580 }, { "epoch": 325.58181818181816, "grad_norm": 0.2737730145454407, "learning_rate": 1.4190000000000001e-05, "loss": 0.343, "step": 3581 }, { "epoch": 325.6787878787879, "grad_norm": 0.3594973087310791, "learning_rate": 1.4180000000000001e-05, "loss": 0.3171, "step": 3582 }, { "epoch": 325.77575757575755, "grad_norm": 0.2571316659450531, "learning_rate": 1.417e-05, "loss": 0.3625, "step": 3583 }, { "epoch": 325.8727272727273, "grad_norm": 0.30839765071868896, "learning_rate": 1.4160000000000002e-05, "loss": 0.3514, "step": 3584 }, { "epoch": 325.969696969697, "grad_norm": 0.3523865044116974, "learning_rate": 1.415e-05, "loss": 0.3502, "step": 3585 }, { "epoch": 326.0, "grad_norm": 0.449184775352478, "learning_rate": 1.414e-05, "loss": 0.3377, "step": 3586 }, { "epoch": 326.0969696969697, "grad_norm": 0.2980722486972809, "learning_rate": 1.4130000000000002e-05, "loss": 0.3289, "step": 3587 }, { "epoch": 326.1939393939394, "grad_norm": 0.2593381702899933, "learning_rate": 1.412e-05, "loss": 0.322, "step": 3588 }, { "epoch": 326.2909090909091, "grad_norm": 0.32586827874183655, "learning_rate": 1.411e-05, "loss": 0.344, "step": 3589 }, { "epoch": 326.3878787878788, "grad_norm": 0.30395492911338806, "learning_rate": 1.4099999999999999e-05, "loss": 0.3395, "step": 3590 }, { "epoch": 326.3878787878788, "eval_loss": 0.4270731210708618, "eval_runtime": 2.1085, "eval_samples_per_second": 26.085, "eval_steps_per_second": 3.32, "step": 3590 }, { "epoch": 326.4848484848485, "grad_norm": 0.25640663504600525, "learning_rate": 1.409e-05, "loss": 0.3299, "step": 3591 }, { "epoch": 326.58181818181816, "grad_norm": 0.28479263186454773, "learning_rate": 1.408e-05, "loss": 0.3439, "step": 3592 }, { "epoch": 326.6787878787879, "grad_norm": 0.3171486556529999, "learning_rate": 1.4069999999999999e-05, "loss": 0.3525, "step": 3593 }, { "epoch": 326.77575757575755, "grad_norm": 0.28105539083480835, "learning_rate": 1.4060000000000001e-05, "loss": 0.3661, "step": 3594 }, { "epoch": 326.8727272727273, "grad_norm": 0.2885766327381134, "learning_rate": 1.4050000000000003e-05, "loss": 0.3232, "step": 3595 }, { "epoch": 326.969696969697, "grad_norm": 0.2773924767971039, "learning_rate": 1.4040000000000001e-05, "loss": 0.3151, "step": 3596 }, { "epoch": 327.0, "grad_norm": 0.5883837938308716, "learning_rate": 1.4030000000000001e-05, "loss": 0.3596, "step": 3597 }, { "epoch": 327.0969696969697, "grad_norm": 0.29262015223503113, "learning_rate": 1.402e-05, "loss": 0.3514, "step": 3598 }, { "epoch": 327.1939393939394, "grad_norm": 0.26522696018218994, "learning_rate": 1.4010000000000001e-05, "loss": 0.3189, "step": 3599 }, { "epoch": 327.2909090909091, "grad_norm": 0.29591551423072815, "learning_rate": 1.4000000000000001e-05, "loss": 0.3355, "step": 3600 }, { "epoch": 327.2909090909091, "eval_loss": 0.42640984058380127, "eval_runtime": 2.1345, "eval_samples_per_second": 25.767, "eval_steps_per_second": 3.279, "step": 3600 }, { "epoch": 327.3878787878788, "grad_norm": 0.2899519205093384, "learning_rate": 1.399e-05, "loss": 0.3344, "step": 3601 }, { "epoch": 327.4848484848485, "grad_norm": 0.31008467078208923, "learning_rate": 1.3980000000000002e-05, "loss": 0.319, "step": 3602 }, { "epoch": 327.58181818181816, "grad_norm": 0.27912476658821106, "learning_rate": 1.397e-05, "loss": 0.3533, "step": 3603 }, { "epoch": 327.6787878787879, "grad_norm": 0.27924835681915283, "learning_rate": 1.396e-05, "loss": 0.359, "step": 3604 }, { "epoch": 327.77575757575755, "grad_norm": 0.28805863857269287, "learning_rate": 1.3950000000000002e-05, "loss": 0.3701, "step": 3605 }, { "epoch": 327.8727272727273, "grad_norm": 0.32875150442123413, "learning_rate": 1.394e-05, "loss": 0.3139, "step": 3606 }, { "epoch": 327.969696969697, "grad_norm": 0.321266233921051, "learning_rate": 1.3930000000000002e-05, "loss": 0.3262, "step": 3607 }, { "epoch": 328.0, "grad_norm": 0.4865063428878784, "learning_rate": 1.3919999999999999e-05, "loss": 0.2978, "step": 3608 }, { "epoch": 328.0969696969697, "grad_norm": 0.3163537085056305, "learning_rate": 1.391e-05, "loss": 0.3324, "step": 3609 }, { "epoch": 328.1939393939394, "grad_norm": 0.28259798884391785, "learning_rate": 1.3900000000000002e-05, "loss": 0.371, "step": 3610 }, { "epoch": 328.1939393939394, "eval_loss": 0.42678430676460266, "eval_runtime": 2.1264, "eval_samples_per_second": 25.865, "eval_steps_per_second": 3.292, "step": 3610 }, { "epoch": 328.2909090909091, "grad_norm": 0.3229399025440216, "learning_rate": 1.389e-05, "loss": 0.3338, "step": 3611 }, { "epoch": 328.3878787878788, "grad_norm": 0.24665595591068268, "learning_rate": 1.3880000000000001e-05, "loss": 0.3561, "step": 3612 }, { "epoch": 328.4848484848485, "grad_norm": 0.3246692717075348, "learning_rate": 1.387e-05, "loss": 0.3467, "step": 3613 }, { "epoch": 328.58181818181816, "grad_norm": 0.24859903752803802, "learning_rate": 1.3860000000000001e-05, "loss": 0.3192, "step": 3614 }, { "epoch": 328.6787878787879, "grad_norm": 0.28727802634239197, "learning_rate": 1.3850000000000001e-05, "loss": 0.3291, "step": 3615 }, { "epoch": 328.77575757575755, "grad_norm": 0.31984421610832214, "learning_rate": 1.384e-05, "loss": 0.3081, "step": 3616 }, { "epoch": 328.8727272727273, "grad_norm": 0.24756306409835815, "learning_rate": 1.3830000000000001e-05, "loss": 0.3564, "step": 3617 }, { "epoch": 328.969696969697, "grad_norm": 0.3125709891319275, "learning_rate": 1.382e-05, "loss": 0.3245, "step": 3618 }, { "epoch": 329.0, "grad_norm": 0.5306656956672668, "learning_rate": 1.381e-05, "loss": 0.304, "step": 3619 }, { "epoch": 329.0969696969697, "grad_norm": 0.3231707811355591, "learning_rate": 1.3800000000000002e-05, "loss": 0.3664, "step": 3620 }, { "epoch": 329.0969696969697, "eval_loss": 0.42667579650878906, "eval_runtime": 2.1187, "eval_samples_per_second": 25.96, "eval_steps_per_second": 3.304, "step": 3620 }, { "epoch": 329.1939393939394, "grad_norm": 0.27657705545425415, "learning_rate": 1.379e-05, "loss": 0.3301, "step": 3621 }, { "epoch": 329.2909090909091, "grad_norm": 0.2769893705844879, "learning_rate": 1.3780000000000002e-05, "loss": 0.3481, "step": 3622 }, { "epoch": 329.3878787878788, "grad_norm": 0.2747114300727844, "learning_rate": 1.377e-05, "loss": 0.3256, "step": 3623 }, { "epoch": 329.4848484848485, "grad_norm": 0.37332674860954285, "learning_rate": 1.376e-05, "loss": 0.3255, "step": 3624 }, { "epoch": 329.58181818181816, "grad_norm": 0.28072360157966614, "learning_rate": 1.3750000000000002e-05, "loss": 0.3399, "step": 3625 }, { "epoch": 329.6787878787879, "grad_norm": 0.3200606405735016, "learning_rate": 1.374e-05, "loss": 0.3075, "step": 3626 }, { "epoch": 329.77575757575755, "grad_norm": 0.3110363483428955, "learning_rate": 1.373e-05, "loss": 0.344, "step": 3627 }, { "epoch": 329.8727272727273, "grad_norm": 0.3103969693183899, "learning_rate": 1.3719999999999999e-05, "loss": 0.3484, "step": 3628 }, { "epoch": 329.969696969697, "grad_norm": 0.3344944417476654, "learning_rate": 1.3710000000000001e-05, "loss": 0.334, "step": 3629 }, { "epoch": 330.0, "grad_norm": 0.5091283321380615, "learning_rate": 1.3700000000000001e-05, "loss": 0.329, "step": 3630 }, { "epoch": 330.0, "eval_loss": 0.42662423849105835, "eval_runtime": 2.1264, "eval_samples_per_second": 25.865, "eval_steps_per_second": 3.292, "step": 3630 }, { "epoch": 330.0969696969697, "grad_norm": 0.2514973282814026, "learning_rate": 1.369e-05, "loss": 0.2958, "step": 3631 }, { "epoch": 330.1939393939394, "grad_norm": 0.295162171125412, "learning_rate": 1.3680000000000001e-05, "loss": 0.3649, "step": 3632 }, { "epoch": 330.2909090909091, "grad_norm": 0.3357464671134949, "learning_rate": 1.367e-05, "loss": 0.3441, "step": 3633 }, { "epoch": 330.3878787878788, "grad_norm": 0.2648813724517822, "learning_rate": 1.3660000000000001e-05, "loss": 0.3225, "step": 3634 }, { "epoch": 330.4848484848485, "grad_norm": 0.3024829924106598, "learning_rate": 1.3650000000000001e-05, "loss": 0.3277, "step": 3635 }, { "epoch": 330.58181818181816, "grad_norm": 0.3073791265487671, "learning_rate": 1.364e-05, "loss": 0.3478, "step": 3636 }, { "epoch": 330.6787878787879, "grad_norm": 0.3012111485004425, "learning_rate": 1.3630000000000002e-05, "loss": 0.3716, "step": 3637 }, { "epoch": 330.77575757575755, "grad_norm": 0.2969381511211395, "learning_rate": 1.362e-05, "loss": 0.3264, "step": 3638 }, { "epoch": 330.8727272727273, "grad_norm": 0.3305814266204834, "learning_rate": 1.361e-05, "loss": 0.3185, "step": 3639 }, { "epoch": 330.969696969697, "grad_norm": 0.29909998178482056, "learning_rate": 1.3600000000000002e-05, "loss": 0.3337, "step": 3640 }, { "epoch": 330.969696969697, "eval_loss": 0.426724910736084, "eval_runtime": 2.1181, "eval_samples_per_second": 25.966, "eval_steps_per_second": 3.305, "step": 3640 }, { "epoch": 331.0, "grad_norm": 0.6468818783760071, "learning_rate": 1.359e-05, "loss": 0.3671, "step": 3641 }, { "epoch": 331.0969696969697, "grad_norm": 0.2924717664718628, "learning_rate": 1.358e-05, "loss": 0.3459, "step": 3642 }, { "epoch": 331.1939393939394, "grad_norm": 0.3029530644416809, "learning_rate": 1.3569999999999999e-05, "loss": 0.3425, "step": 3643 }, { "epoch": 331.2909090909091, "grad_norm": 0.2762484550476074, "learning_rate": 1.356e-05, "loss": 0.3558, "step": 3644 }, { "epoch": 331.3878787878788, "grad_norm": 0.28036603331565857, "learning_rate": 1.3550000000000002e-05, "loss": 0.3028, "step": 3645 }, { "epoch": 331.4848484848485, "grad_norm": 0.2963285446166992, "learning_rate": 1.3539999999999999e-05, "loss": 0.3174, "step": 3646 }, { "epoch": 331.58181818181816, "grad_norm": 0.2616845667362213, "learning_rate": 1.3530000000000001e-05, "loss": 0.347, "step": 3647 }, { "epoch": 331.6787878787879, "grad_norm": 0.30983370542526245, "learning_rate": 1.352e-05, "loss": 0.3538, "step": 3648 }, { "epoch": 331.77575757575755, "grad_norm": 0.28175094723701477, "learning_rate": 1.3510000000000001e-05, "loss": 0.3375, "step": 3649 }, { "epoch": 331.8727272727273, "grad_norm": 0.27941614389419556, "learning_rate": 1.3500000000000001e-05, "loss": 0.3228, "step": 3650 }, { "epoch": 331.8727272727273, "eval_loss": 0.4264315962791443, "eval_runtime": 2.1118, "eval_samples_per_second": 26.044, "eval_steps_per_second": 3.315, "step": 3650 }, { "epoch": 331.969696969697, "grad_norm": 0.28335991501808167, "learning_rate": 1.349e-05, "loss": 0.3275, "step": 3651 }, { "epoch": 332.0, "grad_norm": 0.5670108795166016, "learning_rate": 1.3480000000000001e-05, "loss": 0.3716, "step": 3652 }, { "epoch": 332.0969696969697, "grad_norm": 0.29136744141578674, "learning_rate": 1.347e-05, "loss": 0.3782, "step": 3653 }, { "epoch": 332.1939393939394, "grad_norm": 0.2685430645942688, "learning_rate": 1.346e-05, "loss": 0.3181, "step": 3654 }, { "epoch": 332.2909090909091, "grad_norm": 0.2690100073814392, "learning_rate": 1.3450000000000002e-05, "loss": 0.3499, "step": 3655 }, { "epoch": 332.3878787878788, "grad_norm": 0.28610414266586304, "learning_rate": 1.344e-05, "loss": 0.3198, "step": 3656 }, { "epoch": 332.4848484848485, "grad_norm": 0.3067784905433655, "learning_rate": 1.343e-05, "loss": 0.3223, "step": 3657 }, { "epoch": 332.58181818181816, "grad_norm": 0.27512192726135254, "learning_rate": 1.3420000000000002e-05, "loss": 0.3342, "step": 3658 }, { "epoch": 332.6787878787879, "grad_norm": 0.25142213702201843, "learning_rate": 1.341e-05, "loss": 0.3191, "step": 3659 }, { "epoch": 332.77575757575755, "grad_norm": 0.2903703451156616, "learning_rate": 1.3400000000000002e-05, "loss": 0.3591, "step": 3660 }, { "epoch": 332.77575757575755, "eval_loss": 0.426946222782135, "eval_runtime": 2.1164, "eval_samples_per_second": 25.987, "eval_steps_per_second": 3.307, "step": 3660 }, { "epoch": 332.8727272727273, "grad_norm": 0.2897012531757355, "learning_rate": 1.339e-05, "loss": 0.3465, "step": 3661 }, { "epoch": 332.969696969697, "grad_norm": 0.3455847203731537, "learning_rate": 1.338e-05, "loss": 0.3221, "step": 3662 }, { "epoch": 333.0, "grad_norm": 0.4463295638561249, "learning_rate": 1.3370000000000002e-05, "loss": 0.3094, "step": 3663 }, { "epoch": 333.0969696969697, "grad_norm": 0.2687438130378723, "learning_rate": 1.336e-05, "loss": 0.3238, "step": 3664 }, { "epoch": 333.1939393939394, "grad_norm": 0.27776917815208435, "learning_rate": 1.3350000000000001e-05, "loss": 0.344, "step": 3665 }, { "epoch": 333.2909090909091, "grad_norm": 0.27730217576026917, "learning_rate": 1.334e-05, "loss": 0.3635, "step": 3666 }, { "epoch": 333.3878787878788, "grad_norm": 0.3098108768463135, "learning_rate": 1.3330000000000001e-05, "loss": 0.3259, "step": 3667 }, { "epoch": 333.4848484848485, "grad_norm": 0.2877723276615143, "learning_rate": 1.3320000000000001e-05, "loss": 0.3534, "step": 3668 }, { "epoch": 333.58181818181816, "grad_norm": 0.30429133772850037, "learning_rate": 1.331e-05, "loss": 0.3345, "step": 3669 }, { "epoch": 333.6787878787879, "grad_norm": 0.4659949839115143, "learning_rate": 1.3300000000000001e-05, "loss": 0.3406, "step": 3670 }, { "epoch": 333.6787878787879, "eval_loss": 0.42636099457740784, "eval_runtime": 2.1229, "eval_samples_per_second": 25.908, "eval_steps_per_second": 3.297, "step": 3670 }, { "epoch": 333.77575757575755, "grad_norm": 0.28408676385879517, "learning_rate": 1.329e-05, "loss": 0.3153, "step": 3671 }, { "epoch": 333.8727272727273, "grad_norm": 0.3500078320503235, "learning_rate": 1.3280000000000002e-05, "loss": 0.3392, "step": 3672 }, { "epoch": 333.969696969697, "grad_norm": 0.31569191813468933, "learning_rate": 1.3270000000000002e-05, "loss": 0.313, "step": 3673 }, { "epoch": 334.0, "grad_norm": 0.4757641553878784, "learning_rate": 1.326e-05, "loss": 0.3553, "step": 3674 }, { "epoch": 334.0969696969697, "grad_norm": 0.261425256729126, "learning_rate": 1.3250000000000002e-05, "loss": 0.3573, "step": 3675 }, { "epoch": 334.1939393939394, "grad_norm": 0.3167806565761566, "learning_rate": 1.324e-05, "loss": 0.3145, "step": 3676 }, { "epoch": 334.2909090909091, "grad_norm": 0.2758427858352661, "learning_rate": 1.323e-05, "loss": 0.3121, "step": 3677 }, { "epoch": 334.3878787878788, "grad_norm": 0.32896846532821655, "learning_rate": 1.3220000000000002e-05, "loss": 0.3737, "step": 3678 }, { "epoch": 334.4848484848485, "grad_norm": 0.2690833806991577, "learning_rate": 1.321e-05, "loss": 0.3399, "step": 3679 }, { "epoch": 334.58181818181816, "grad_norm": 0.34167739748954773, "learning_rate": 1.32e-05, "loss": 0.3438, "step": 3680 }, { "epoch": 334.58181818181816, "eval_loss": 0.42669355869293213, "eval_runtime": 2.1225, "eval_samples_per_second": 25.913, "eval_steps_per_second": 3.298, "step": 3680 }, { "epoch": 334.6787878787879, "grad_norm": 0.2956273555755615, "learning_rate": 1.3189999999999999e-05, "loss": 0.3606, "step": 3681 }, { "epoch": 334.77575757575755, "grad_norm": 0.2629927694797516, "learning_rate": 1.3180000000000001e-05, "loss": 0.3291, "step": 3682 }, { "epoch": 334.8727272727273, "grad_norm": 0.2564525008201599, "learning_rate": 1.3170000000000001e-05, "loss": 0.2946, "step": 3683 }, { "epoch": 334.969696969697, "grad_norm": 0.30224695801734924, "learning_rate": 1.316e-05, "loss": 0.3371, "step": 3684 }, { "epoch": 335.0, "grad_norm": 0.430014431476593, "learning_rate": 1.3150000000000001e-05, "loss": 0.3143, "step": 3685 }, { "epoch": 335.0969696969697, "grad_norm": 0.2904576063156128, "learning_rate": 1.314e-05, "loss": 0.3467, "step": 3686 }, { "epoch": 335.1939393939394, "grad_norm": 0.286720335483551, "learning_rate": 1.3130000000000001e-05, "loss": 0.3332, "step": 3687 }, { "epoch": 335.2909090909091, "grad_norm": 0.36974719166755676, "learning_rate": 1.3120000000000001e-05, "loss": 0.3734, "step": 3688 }, { "epoch": 335.3878787878788, "grad_norm": 0.2774631381034851, "learning_rate": 1.311e-05, "loss": 0.3064, "step": 3689 }, { "epoch": 335.4848484848485, "grad_norm": 0.30980628728866577, "learning_rate": 1.3100000000000002e-05, "loss": 0.3694, "step": 3690 }, { "epoch": 335.4848484848485, "eval_loss": 0.4261588752269745, "eval_runtime": 2.1539, "eval_samples_per_second": 25.535, "eval_steps_per_second": 3.25, "step": 3690 }, { "epoch": 335.58181818181816, "grad_norm": 0.2886945605278015, "learning_rate": 1.309e-05, "loss": 0.3054, "step": 3691 }, { "epoch": 335.6787878787879, "grad_norm": 0.285832017660141, "learning_rate": 1.308e-05, "loss": 0.3251, "step": 3692 }, { "epoch": 335.77575757575755, "grad_norm": 0.2936649024486542, "learning_rate": 1.3070000000000002e-05, "loss": 0.3398, "step": 3693 }, { "epoch": 335.8727272727273, "grad_norm": 0.2469436377286911, "learning_rate": 1.306e-05, "loss": 0.3497, "step": 3694 }, { "epoch": 335.969696969697, "grad_norm": 0.2709849178791046, "learning_rate": 1.305e-05, "loss": 0.3155, "step": 3695 }, { "epoch": 336.0, "grad_norm": 0.5284827947616577, "learning_rate": 1.3039999999999999e-05, "loss": 0.3105, "step": 3696 }, { "epoch": 336.0969696969697, "grad_norm": 0.26851266622543335, "learning_rate": 1.303e-05, "loss": 0.3332, "step": 3697 }, { "epoch": 336.1939393939394, "grad_norm": 0.306275337934494, "learning_rate": 1.3020000000000002e-05, "loss": 0.3403, "step": 3698 }, { "epoch": 336.2909090909091, "grad_norm": 0.29274263978004456, "learning_rate": 1.301e-05, "loss": 0.3411, "step": 3699 }, { "epoch": 336.3878787878788, "grad_norm": 0.2938704788684845, "learning_rate": 1.3000000000000001e-05, "loss": 0.3286, "step": 3700 }, { "epoch": 336.3878787878788, "eval_loss": 0.4266641438007355, "eval_runtime": 2.1234, "eval_samples_per_second": 25.902, "eval_steps_per_second": 3.297, "step": 3700 }, { "epoch": 336.4848484848485, "grad_norm": 0.3236421048641205, "learning_rate": 1.299e-05, "loss": 0.3311, "step": 3701 }, { "epoch": 336.58181818181816, "grad_norm": 0.3449302911758423, "learning_rate": 1.2980000000000001e-05, "loss": 0.3351, "step": 3702 }, { "epoch": 336.6787878787879, "grad_norm": 0.2878248989582062, "learning_rate": 1.2970000000000001e-05, "loss": 0.3222, "step": 3703 }, { "epoch": 336.77575757575755, "grad_norm": 0.27355077862739563, "learning_rate": 1.296e-05, "loss": 0.3229, "step": 3704 }, { "epoch": 336.8727272727273, "grad_norm": 0.285319983959198, "learning_rate": 1.2950000000000001e-05, "loss": 0.357, "step": 3705 }, { "epoch": 336.969696969697, "grad_norm": 0.2839474678039551, "learning_rate": 1.294e-05, "loss": 0.3399, "step": 3706 }, { "epoch": 337.0, "grad_norm": 0.42722317576408386, "learning_rate": 1.293e-05, "loss": 0.3444, "step": 3707 }, { "epoch": 337.0969696969697, "grad_norm": 0.26353755593299866, "learning_rate": 1.2920000000000002e-05, "loss": 0.3869, "step": 3708 }, { "epoch": 337.1939393939394, "grad_norm": 0.26299166679382324, "learning_rate": 1.291e-05, "loss": 0.3225, "step": 3709 }, { "epoch": 337.2909090909091, "grad_norm": 0.3076457679271698, "learning_rate": 1.29e-05, "loss": 0.3096, "step": 3710 }, { "epoch": 337.2909090909091, "eval_loss": 0.4259534180164337, "eval_runtime": 2.1593, "eval_samples_per_second": 25.471, "eval_steps_per_second": 3.242, "step": 3710 }, { "epoch": 337.3878787878788, "grad_norm": 0.3430866301059723, "learning_rate": 1.2889999999999999e-05, "loss": 0.3567, "step": 3711 }, { "epoch": 337.4848484848485, "grad_norm": 0.35578009486198425, "learning_rate": 1.288e-05, "loss": 0.2813, "step": 3712 }, { "epoch": 337.58181818181816, "grad_norm": 0.2620140016078949, "learning_rate": 1.2870000000000002e-05, "loss": 0.3425, "step": 3713 }, { "epoch": 337.6787878787879, "grad_norm": 0.32215750217437744, "learning_rate": 1.286e-05, "loss": 0.326, "step": 3714 }, { "epoch": 337.77575757575755, "grad_norm": 0.3033801019191742, "learning_rate": 1.285e-05, "loss": 0.3361, "step": 3715 }, { "epoch": 337.8727272727273, "grad_norm": 0.32087045907974243, "learning_rate": 1.2839999999999999e-05, "loss": 0.3533, "step": 3716 }, { "epoch": 337.969696969697, "grad_norm": 0.3305118680000305, "learning_rate": 1.283e-05, "loss": 0.3324, "step": 3717 }, { "epoch": 338.0, "grad_norm": 0.5343279838562012, "learning_rate": 1.2820000000000001e-05, "loss": 0.3582, "step": 3718 }, { "epoch": 338.0969696969697, "grad_norm": 0.3098539113998413, "learning_rate": 1.281e-05, "loss": 0.3337, "step": 3719 }, { "epoch": 338.1939393939394, "grad_norm": 0.3534613847732544, "learning_rate": 1.2800000000000001e-05, "loss": 0.3418, "step": 3720 }, { "epoch": 338.1939393939394, "eval_loss": 0.42633816599845886, "eval_runtime": 2.1317, "eval_samples_per_second": 25.801, "eval_steps_per_second": 3.284, "step": 3720 }, { "epoch": 338.2909090909091, "grad_norm": 0.27533775568008423, "learning_rate": 1.2790000000000001e-05, "loss": 0.3369, "step": 3721 }, { "epoch": 338.3878787878788, "grad_norm": 0.27337315678596497, "learning_rate": 1.278e-05, "loss": 0.3259, "step": 3722 }, { "epoch": 338.4848484848485, "grad_norm": 0.30268651247024536, "learning_rate": 1.2770000000000001e-05, "loss": 0.3497, "step": 3723 }, { "epoch": 338.58181818181816, "grad_norm": 0.27092444896698, "learning_rate": 1.276e-05, "loss": 0.3558, "step": 3724 }, { "epoch": 338.6787878787879, "grad_norm": 0.2851577401161194, "learning_rate": 1.2750000000000002e-05, "loss": 0.3132, "step": 3725 }, { "epoch": 338.77575757575755, "grad_norm": 0.3387395441532135, "learning_rate": 1.2740000000000002e-05, "loss": 0.3087, "step": 3726 }, { "epoch": 338.8727272727273, "grad_norm": 0.27414950728416443, "learning_rate": 1.273e-05, "loss": 0.3613, "step": 3727 }, { "epoch": 338.969696969697, "grad_norm": 0.2948184609413147, "learning_rate": 1.2720000000000002e-05, "loss": 0.3251, "step": 3728 }, { "epoch": 339.0, "grad_norm": 0.544585645198822, "learning_rate": 1.271e-05, "loss": 0.3472, "step": 3729 }, { "epoch": 339.0969696969697, "grad_norm": 0.2687508463859558, "learning_rate": 1.27e-05, "loss": 0.3414, "step": 3730 }, { "epoch": 339.0969696969697, "eval_loss": 0.4261287748813629, "eval_runtime": 2.1147, "eval_samples_per_second": 26.008, "eval_steps_per_second": 3.31, "step": 3730 }, { "epoch": 339.1939393939394, "grad_norm": 0.3146977722644806, "learning_rate": 1.2690000000000002e-05, "loss": 0.3515, "step": 3731 }, { "epoch": 339.2909090909091, "grad_norm": 0.2693532109260559, "learning_rate": 1.268e-05, "loss": 0.3107, "step": 3732 }, { "epoch": 339.3878787878788, "grad_norm": 0.30518874526023865, "learning_rate": 1.267e-05, "loss": 0.3177, "step": 3733 }, { "epoch": 339.4848484848485, "grad_norm": 0.2727731168270111, "learning_rate": 1.2659999999999999e-05, "loss": 0.3353, "step": 3734 }, { "epoch": 339.58181818181816, "grad_norm": 0.31078311800956726, "learning_rate": 1.2650000000000001e-05, "loss": 0.3035, "step": 3735 }, { "epoch": 339.6787878787879, "grad_norm": 0.2725074589252472, "learning_rate": 1.2640000000000003e-05, "loss": 0.3455, "step": 3736 }, { "epoch": 339.77575757575755, "grad_norm": 0.3046586811542511, "learning_rate": 1.263e-05, "loss": 0.39, "step": 3737 }, { "epoch": 339.8727272727273, "grad_norm": 0.3013319671154022, "learning_rate": 1.2620000000000001e-05, "loss": 0.3466, "step": 3738 }, { "epoch": 339.969696969697, "grad_norm": 0.32089748978614807, "learning_rate": 1.261e-05, "loss": 0.322, "step": 3739 }, { "epoch": 340.0, "grad_norm": 0.5079250335693359, "learning_rate": 1.2600000000000001e-05, "loss": 0.2924, "step": 3740 }, { "epoch": 340.0, "eval_loss": 0.42617207765579224, "eval_runtime": 2.1379, "eval_samples_per_second": 25.726, "eval_steps_per_second": 3.274, "step": 3740 }, { "epoch": 340.0969696969697, "grad_norm": 0.25629541277885437, "learning_rate": 1.2590000000000001e-05, "loss": 0.3203, "step": 3741 }, { "epoch": 340.1939393939394, "grad_norm": 0.25945302844047546, "learning_rate": 1.258e-05, "loss": 0.3363, "step": 3742 }, { "epoch": 340.2909090909091, "grad_norm": 0.28978466987609863, "learning_rate": 1.2570000000000002e-05, "loss": 0.3216, "step": 3743 }, { "epoch": 340.3878787878788, "grad_norm": 0.2984698712825775, "learning_rate": 1.256e-05, "loss": 0.3478, "step": 3744 }, { "epoch": 340.4848484848485, "grad_norm": 0.3027973473072052, "learning_rate": 1.255e-05, "loss": 0.3527, "step": 3745 }, { "epoch": 340.58181818181816, "grad_norm": 0.2580370306968689, "learning_rate": 1.2540000000000002e-05, "loss": 0.3283, "step": 3746 }, { "epoch": 340.6787878787879, "grad_norm": 0.3044896721839905, "learning_rate": 1.253e-05, "loss": 0.3256, "step": 3747 }, { "epoch": 340.77575757575755, "grad_norm": 0.28928688168525696, "learning_rate": 1.252e-05, "loss": 0.3504, "step": 3748 }, { "epoch": 340.8727272727273, "grad_norm": 0.36256060004234314, "learning_rate": 1.2509999999999999e-05, "loss": 0.3382, "step": 3749 }, { "epoch": 340.969696969697, "grad_norm": 0.3101032078266144, "learning_rate": 1.25e-05, "loss": 0.3296, "step": 3750 }, { "epoch": 340.969696969697, "eval_loss": 0.42549657821655273, "eval_runtime": 2.1573, "eval_samples_per_second": 25.495, "eval_steps_per_second": 3.245, "step": 3750 }, { "epoch": 341.0, "grad_norm": 0.590671718120575, "learning_rate": 1.249e-05, "loss": 0.3182, "step": 3751 }, { "epoch": 341.0969696969697, "grad_norm": 0.2854500114917755, "learning_rate": 1.248e-05, "loss": 0.3615, "step": 3752 }, { "epoch": 341.1939393939394, "grad_norm": 0.35217300057411194, "learning_rate": 1.2470000000000001e-05, "loss": 0.3348, "step": 3753 }, { "epoch": 341.2909090909091, "grad_norm": 0.2804906666278839, "learning_rate": 1.2460000000000001e-05, "loss": 0.3354, "step": 3754 }, { "epoch": 341.3878787878788, "grad_norm": 0.25322121381759644, "learning_rate": 1.2450000000000001e-05, "loss": 0.36, "step": 3755 }, { "epoch": 341.4848484848485, "grad_norm": 0.31866592168807983, "learning_rate": 1.244e-05, "loss": 0.3086, "step": 3756 }, { "epoch": 341.58181818181816, "grad_norm": 0.31964659690856934, "learning_rate": 1.243e-05, "loss": 0.3385, "step": 3757 }, { "epoch": 341.6787878787879, "grad_norm": 0.2785675823688507, "learning_rate": 1.2420000000000001e-05, "loss": 0.3326, "step": 3758 }, { "epoch": 341.77575757575755, "grad_norm": 0.2687302231788635, "learning_rate": 1.2410000000000001e-05, "loss": 0.303, "step": 3759 }, { "epoch": 341.8727272727273, "grad_norm": 0.2588405907154083, "learning_rate": 1.24e-05, "loss": 0.3213, "step": 3760 }, { "epoch": 341.8727272727273, "eval_loss": 0.42594873905181885, "eval_runtime": 2.1086, "eval_samples_per_second": 26.084, "eval_steps_per_second": 3.32, "step": 3760 }, { "epoch": 341.969696969697, "grad_norm": 0.3199463486671448, "learning_rate": 1.239e-05, "loss": 0.3517, "step": 3761 }, { "epoch": 342.0, "grad_norm": 0.4935012757778168, "learning_rate": 1.238e-05, "loss": 0.3284, "step": 3762 }, { "epoch": 342.0969696969697, "grad_norm": 0.36274877190589905, "learning_rate": 1.2370000000000002e-05, "loss": 0.3371, "step": 3763 }, { "epoch": 342.1939393939394, "grad_norm": 0.28404876589775085, "learning_rate": 1.236e-05, "loss": 0.3289, "step": 3764 }, { "epoch": 342.2909090909091, "grad_norm": 0.27230966091156006, "learning_rate": 1.235e-05, "loss": 0.3121, "step": 3765 }, { "epoch": 342.3878787878788, "grad_norm": 0.2236277163028717, "learning_rate": 1.234e-05, "loss": 0.3464, "step": 3766 }, { "epoch": 342.4848484848485, "grad_norm": 0.2575564980506897, "learning_rate": 1.233e-05, "loss": 0.319, "step": 3767 }, { "epoch": 342.58181818181816, "grad_norm": 0.3601349890232086, "learning_rate": 1.232e-05, "loss": 0.357, "step": 3768 }, { "epoch": 342.6787878787879, "grad_norm": 0.3346935510635376, "learning_rate": 1.231e-05, "loss": 0.3365, "step": 3769 }, { "epoch": 342.77575757575755, "grad_norm": 0.3269232213497162, "learning_rate": 1.23e-05, "loss": 0.3441, "step": 3770 }, { "epoch": 342.77575757575755, "eval_loss": 0.42581814527511597, "eval_runtime": 2.1304, "eval_samples_per_second": 25.816, "eval_steps_per_second": 3.286, "step": 3770 }, { "epoch": 342.8727272727273, "grad_norm": 0.2760692536830902, "learning_rate": 1.2290000000000001e-05, "loss": 0.3279, "step": 3771 }, { "epoch": 342.969696969697, "grad_norm": 0.3023752272129059, "learning_rate": 1.2280000000000001e-05, "loss": 0.3521, "step": 3772 }, { "epoch": 343.0, "grad_norm": 0.5031427145004272, "learning_rate": 1.2270000000000001e-05, "loss": 0.2737, "step": 3773 }, { "epoch": 343.0969696969697, "grad_norm": 0.2600629925727844, "learning_rate": 1.2260000000000001e-05, "loss": 0.334, "step": 3774 }, { "epoch": 343.1939393939394, "grad_norm": 0.2708774209022522, "learning_rate": 1.225e-05, "loss": 0.3288, "step": 3775 }, { "epoch": 343.2909090909091, "grad_norm": 0.30316439270973206, "learning_rate": 1.224e-05, "loss": 0.3327, "step": 3776 }, { "epoch": 343.3878787878788, "grad_norm": 0.2830888330936432, "learning_rate": 1.2230000000000001e-05, "loss": 0.3221, "step": 3777 }, { "epoch": 343.4848484848485, "grad_norm": 0.33427178859710693, "learning_rate": 1.2220000000000002e-05, "loss": 0.3008, "step": 3778 }, { "epoch": 343.58181818181816, "grad_norm": 0.3453505039215088, "learning_rate": 1.221e-05, "loss": 0.3728, "step": 3779 }, { "epoch": 343.6787878787879, "grad_norm": 0.2651301920413971, "learning_rate": 1.22e-05, "loss": 0.3563, "step": 3780 }, { "epoch": 343.6787878787879, "eval_loss": 0.42604801058769226, "eval_runtime": 2.1237, "eval_samples_per_second": 25.899, "eval_steps_per_second": 3.296, "step": 3780 }, { "epoch": 343.77575757575755, "grad_norm": 0.28830796480178833, "learning_rate": 1.219e-05, "loss": 0.343, "step": 3781 }, { "epoch": 343.8727272727273, "grad_norm": 0.3189815878868103, "learning_rate": 1.2180000000000002e-05, "loss": 0.3287, "step": 3782 }, { "epoch": 343.969696969697, "grad_norm": 0.3367035984992981, "learning_rate": 1.217e-05, "loss": 0.3363, "step": 3783 }, { "epoch": 344.0, "grad_norm": 0.5709712505340576, "learning_rate": 1.216e-05, "loss": 0.2973, "step": 3784 }, { "epoch": 344.0969696969697, "grad_norm": 0.30823367834091187, "learning_rate": 1.215e-05, "loss": 0.305, "step": 3785 }, { "epoch": 344.1939393939394, "grad_norm": 0.32099658250808716, "learning_rate": 1.214e-05, "loss": 0.3073, "step": 3786 }, { "epoch": 344.2909090909091, "grad_norm": 0.25584954023361206, "learning_rate": 1.213e-05, "loss": 0.3347, "step": 3787 }, { "epoch": 344.3878787878788, "grad_norm": 0.3256348669528961, "learning_rate": 1.2120000000000001e-05, "loss": 0.3043, "step": 3788 }, { "epoch": 344.4848484848485, "grad_norm": 0.2968614399433136, "learning_rate": 1.2110000000000001e-05, "loss": 0.3308, "step": 3789 }, { "epoch": 344.58181818181816, "grad_norm": 0.28445330262184143, "learning_rate": 1.2100000000000001e-05, "loss": 0.351, "step": 3790 }, { "epoch": 344.58181818181816, "eval_loss": 0.4258812367916107, "eval_runtime": 2.1339, "eval_samples_per_second": 25.775, "eval_steps_per_second": 3.28, "step": 3790 }, { "epoch": 344.6787878787879, "grad_norm": 0.3036136329174042, "learning_rate": 1.209e-05, "loss": 0.3405, "step": 3791 }, { "epoch": 344.77575757575755, "grad_norm": 0.28134721517562866, "learning_rate": 1.2080000000000001e-05, "loss": 0.3625, "step": 3792 }, { "epoch": 344.8727272727273, "grad_norm": 0.27972412109375, "learning_rate": 1.2070000000000001e-05, "loss": 0.3769, "step": 3793 }, { "epoch": 344.969696969697, "grad_norm": 0.32446789741516113, "learning_rate": 1.206e-05, "loss": 0.3437, "step": 3794 }, { "epoch": 345.0, "grad_norm": 0.47319990396499634, "learning_rate": 1.205e-05, "loss": 0.2866, "step": 3795 }, { "epoch": 345.0969696969697, "grad_norm": 0.2875915765762329, "learning_rate": 1.204e-05, "loss": 0.3453, "step": 3796 }, { "epoch": 345.1939393939394, "grad_norm": 0.2987116575241089, "learning_rate": 1.2030000000000002e-05, "loss": 0.3257, "step": 3797 }, { "epoch": 345.2909090909091, "grad_norm": 0.31415995955467224, "learning_rate": 1.202e-05, "loss": 0.3464, "step": 3798 }, { "epoch": 345.3878787878788, "grad_norm": 0.3293113708496094, "learning_rate": 1.201e-05, "loss": 0.3064, "step": 3799 }, { "epoch": 345.4848484848485, "grad_norm": 0.2928531765937805, "learning_rate": 1.2e-05, "loss": 0.3113, "step": 3800 }, { "epoch": 345.4848484848485, "eval_loss": 0.4257052540779114, "eval_runtime": 2.116, "eval_samples_per_second": 25.992, "eval_steps_per_second": 3.308, "step": 3800 }, { "epoch": 345.58181818181816, "grad_norm": 0.29395541548728943, "learning_rate": 1.199e-05, "loss": 0.3525, "step": 3801 }, { "epoch": 345.6787878787879, "grad_norm": 0.29284462332725525, "learning_rate": 1.198e-05, "loss": 0.3008, "step": 3802 }, { "epoch": 345.77575757575755, "grad_norm": 0.2965705692768097, "learning_rate": 1.197e-05, "loss": 0.3647, "step": 3803 }, { "epoch": 345.8727272727273, "grad_norm": 0.30063632130622864, "learning_rate": 1.196e-05, "loss": 0.3421, "step": 3804 }, { "epoch": 345.969696969697, "grad_norm": 0.2827775478363037, "learning_rate": 1.195e-05, "loss": 0.3481, "step": 3805 }, { "epoch": 346.0, "grad_norm": 0.7182817459106445, "learning_rate": 1.1940000000000001e-05, "loss": 0.3309, "step": 3806 }, { "epoch": 346.0969696969697, "grad_norm": 0.30240872502326965, "learning_rate": 1.1930000000000001e-05, "loss": 0.3205, "step": 3807 }, { "epoch": 346.1939393939394, "grad_norm": 0.2711304724216461, "learning_rate": 1.1920000000000001e-05, "loss": 0.3334, "step": 3808 }, { "epoch": 346.2909090909091, "grad_norm": 0.27359533309936523, "learning_rate": 1.1910000000000001e-05, "loss": 0.3547, "step": 3809 }, { "epoch": 346.3878787878788, "grad_norm": 0.47756561636924744, "learning_rate": 1.19e-05, "loss": 0.3202, "step": 3810 }, { "epoch": 346.3878787878788, "eval_loss": 0.42556026577949524, "eval_runtime": 2.1237, "eval_samples_per_second": 25.898, "eval_steps_per_second": 3.296, "step": 3810 }, { "epoch": 346.4848484848485, "grad_norm": 0.3103063106536865, "learning_rate": 1.1890000000000001e-05, "loss": 0.3288, "step": 3811 }, { "epoch": 346.58181818181816, "grad_norm": 0.3046967387199402, "learning_rate": 1.1880000000000001e-05, "loss": 0.3435, "step": 3812 }, { "epoch": 346.6787878787879, "grad_norm": 0.33921241760253906, "learning_rate": 1.187e-05, "loss": 0.3244, "step": 3813 }, { "epoch": 346.77575757575755, "grad_norm": 0.2951901853084564, "learning_rate": 1.186e-05, "loss": 0.3583, "step": 3814 }, { "epoch": 346.8727272727273, "grad_norm": 0.3240387737751007, "learning_rate": 1.185e-05, "loss": 0.3247, "step": 3815 }, { "epoch": 346.969696969697, "grad_norm": 0.2763834595680237, "learning_rate": 1.1840000000000002e-05, "loss": 0.3465, "step": 3816 }, { "epoch": 347.0, "grad_norm": 0.5004387497901917, "learning_rate": 1.183e-05, "loss": 0.282, "step": 3817 }, { "epoch": 347.0969696969697, "grad_norm": 0.3101780116558075, "learning_rate": 1.182e-05, "loss": 0.3448, "step": 3818 }, { "epoch": 347.1939393939394, "grad_norm": 0.2685920000076294, "learning_rate": 1.181e-05, "loss": 0.3363, "step": 3819 }, { "epoch": 347.2909090909091, "grad_norm": 0.27615267038345337, "learning_rate": 1.18e-05, "loss": 0.328, "step": 3820 }, { "epoch": 347.2909090909091, "eval_loss": 0.42572295665740967, "eval_runtime": 2.12, "eval_samples_per_second": 25.944, "eval_steps_per_second": 3.302, "step": 3820 }, { "epoch": 347.3878787878788, "grad_norm": 0.2806648015975952, "learning_rate": 1.179e-05, "loss": 0.3218, "step": 3821 }, { "epoch": 347.4848484848485, "grad_norm": 0.2721249461174011, "learning_rate": 1.178e-05, "loss": 0.3465, "step": 3822 }, { "epoch": 347.58181818181816, "grad_norm": 0.2903819978237152, "learning_rate": 1.177e-05, "loss": 0.3075, "step": 3823 }, { "epoch": 347.6787878787879, "grad_norm": 0.2980758845806122, "learning_rate": 1.1760000000000001e-05, "loss": 0.3462, "step": 3824 }, { "epoch": 347.77575757575755, "grad_norm": 0.3511853516101837, "learning_rate": 1.175e-05, "loss": 0.3159, "step": 3825 }, { "epoch": 347.8727272727273, "grad_norm": 0.276050865650177, "learning_rate": 1.1740000000000001e-05, "loss": 0.3209, "step": 3826 }, { "epoch": 347.969696969697, "grad_norm": 0.3628363609313965, "learning_rate": 1.1730000000000001e-05, "loss": 0.3595, "step": 3827 }, { "epoch": 348.0, "grad_norm": 0.6746332049369812, "learning_rate": 1.172e-05, "loss": 0.3602, "step": 3828 }, { "epoch": 348.0969696969697, "grad_norm": 0.2941165268421173, "learning_rate": 1.171e-05, "loss": 0.3088, "step": 3829 }, { "epoch": 348.1939393939394, "grad_norm": 0.28681251406669617, "learning_rate": 1.1700000000000001e-05, "loss": 0.3529, "step": 3830 }, { "epoch": 348.1939393939394, "eval_loss": 0.42591407895088196, "eval_runtime": 2.1257, "eval_samples_per_second": 25.873, "eval_steps_per_second": 3.293, "step": 3830 }, { "epoch": 348.2909090909091, "grad_norm": 0.30104222893714905, "learning_rate": 1.1690000000000002e-05, "loss": 0.3544, "step": 3831 }, { "epoch": 348.3878787878788, "grad_norm": 0.27228203415870667, "learning_rate": 1.168e-05, "loss": 0.3113, "step": 3832 }, { "epoch": 348.4848484848485, "grad_norm": 0.2858501374721527, "learning_rate": 1.167e-05, "loss": 0.3268, "step": 3833 }, { "epoch": 348.58181818181816, "grad_norm": 0.2979985475540161, "learning_rate": 1.166e-05, "loss": 0.3642, "step": 3834 }, { "epoch": 348.6787878787879, "grad_norm": 0.29480111598968506, "learning_rate": 1.1650000000000002e-05, "loss": 0.3566, "step": 3835 }, { "epoch": 348.77575757575755, "grad_norm": 0.3030244708061218, "learning_rate": 1.164e-05, "loss": 0.3377, "step": 3836 }, { "epoch": 348.8727272727273, "grad_norm": 0.32541602849960327, "learning_rate": 1.163e-05, "loss": 0.3218, "step": 3837 }, { "epoch": 348.969696969697, "grad_norm": 0.28534772992134094, "learning_rate": 1.162e-05, "loss": 0.3071, "step": 3838 }, { "epoch": 349.0, "grad_norm": 0.5974946022033691, "learning_rate": 1.161e-05, "loss": 0.3073, "step": 3839 }, { "epoch": 349.0969696969697, "grad_norm": 0.31280946731567383, "learning_rate": 1.16e-05, "loss": 0.3439, "step": 3840 }, { "epoch": 349.0969696969697, "eval_loss": 0.42570239305496216, "eval_runtime": 2.1285, "eval_samples_per_second": 25.839, "eval_steps_per_second": 3.289, "step": 3840 }, { "epoch": 349.1939393939394, "grad_norm": 0.29928094148635864, "learning_rate": 1.159e-05, "loss": 0.3078, "step": 3841 }, { "epoch": 349.2909090909091, "grad_norm": 0.35683488845825195, "learning_rate": 1.1580000000000001e-05, "loss": 0.3568, "step": 3842 }, { "epoch": 349.3878787878788, "grad_norm": 0.2896963953971863, "learning_rate": 1.1570000000000001e-05, "loss": 0.356, "step": 3843 }, { "epoch": 349.4848484848485, "grad_norm": 0.24926257133483887, "learning_rate": 1.156e-05, "loss": 0.3207, "step": 3844 }, { "epoch": 349.58181818181816, "grad_norm": 0.27918097376823425, "learning_rate": 1.1550000000000001e-05, "loss": 0.3037, "step": 3845 }, { "epoch": 349.6787878787879, "grad_norm": 0.30067959427833557, "learning_rate": 1.1540000000000001e-05, "loss": 0.3175, "step": 3846 }, { "epoch": 349.77575757575755, "grad_norm": 0.29443076252937317, "learning_rate": 1.153e-05, "loss": 0.305, "step": 3847 }, { "epoch": 349.8727272727273, "grad_norm": 0.3009408712387085, "learning_rate": 1.152e-05, "loss": 0.3323, "step": 3848 }, { "epoch": 349.969696969697, "grad_norm": 0.33346986770629883, "learning_rate": 1.151e-05, "loss": 0.365, "step": 3849 }, { "epoch": 350.0, "grad_norm": 0.5449870824813843, "learning_rate": 1.1500000000000002e-05, "loss": 0.3946, "step": 3850 }, { "epoch": 350.0, "eval_loss": 0.4254421591758728, "eval_runtime": 2.1379, "eval_samples_per_second": 25.727, "eval_steps_per_second": 3.274, "step": 3850 }, { "epoch": 350.0969696969697, "grad_norm": 0.3212931454181671, "learning_rate": 1.149e-05, "loss": 0.3496, "step": 3851 }, { "epoch": 350.1939393939394, "grad_norm": 0.351112961769104, "learning_rate": 1.148e-05, "loss": 0.3197, "step": 3852 }, { "epoch": 350.2909090909091, "grad_norm": 0.29633641242980957, "learning_rate": 1.147e-05, "loss": 0.3099, "step": 3853 }, { "epoch": 350.3878787878788, "grad_norm": 0.31008249521255493, "learning_rate": 1.146e-05, "loss": 0.2962, "step": 3854 }, { "epoch": 350.4848484848485, "grad_norm": 0.31466251611709595, "learning_rate": 1.145e-05, "loss": 0.3429, "step": 3855 }, { "epoch": 350.58181818181816, "grad_norm": 0.27296024560928345, "learning_rate": 1.144e-05, "loss": 0.3601, "step": 3856 }, { "epoch": 350.6787878787879, "grad_norm": 0.2866796553134918, "learning_rate": 1.143e-05, "loss": 0.3148, "step": 3857 }, { "epoch": 350.77575757575755, "grad_norm": 0.3360688388347626, "learning_rate": 1.142e-05, "loss": 0.3429, "step": 3858 }, { "epoch": 350.8727272727273, "grad_norm": 0.2791575789451599, "learning_rate": 1.141e-05, "loss": 0.3234, "step": 3859 }, { "epoch": 350.969696969697, "grad_norm": 0.310596764087677, "learning_rate": 1.1400000000000001e-05, "loss": 0.3686, "step": 3860 }, { "epoch": 350.969696969697, "eval_loss": 0.42575693130493164, "eval_runtime": 2.1423, "eval_samples_per_second": 25.673, "eval_steps_per_second": 3.267, "step": 3860 }, { "epoch": 351.0, "grad_norm": 0.42716941237449646, "learning_rate": 1.1390000000000001e-05, "loss": 0.3444, "step": 3861 }, { "epoch": 351.0969696969697, "grad_norm": 0.28955182433128357, "learning_rate": 1.1380000000000001e-05, "loss": 0.3347, "step": 3862 }, { "epoch": 351.1939393939394, "grad_norm": 0.27090415358543396, "learning_rate": 1.137e-05, "loss": 0.3043, "step": 3863 }, { "epoch": 351.2909090909091, "grad_norm": 0.2701130211353302, "learning_rate": 1.1360000000000001e-05, "loss": 0.3114, "step": 3864 }, { "epoch": 351.3878787878788, "grad_norm": 0.28966429829597473, "learning_rate": 1.1350000000000001e-05, "loss": 0.305, "step": 3865 }, { "epoch": 351.4848484848485, "grad_norm": 0.295930951833725, "learning_rate": 1.134e-05, "loss": 0.3698, "step": 3866 }, { "epoch": 351.58181818181816, "grad_norm": 0.25204113125801086, "learning_rate": 1.133e-05, "loss": 0.3581, "step": 3867 }, { "epoch": 351.6787878787879, "grad_norm": 0.2838861048221588, "learning_rate": 1.132e-05, "loss": 0.325, "step": 3868 }, { "epoch": 351.77575757575755, "grad_norm": 0.3151472210884094, "learning_rate": 1.1310000000000002e-05, "loss": 0.31, "step": 3869 }, { "epoch": 351.8727272727273, "grad_norm": 0.31971457600593567, "learning_rate": 1.13e-05, "loss": 0.3523, "step": 3870 }, { "epoch": 351.8727272727273, "eval_loss": 0.4253227114677429, "eval_runtime": 2.1596, "eval_samples_per_second": 25.467, "eval_steps_per_second": 3.241, "step": 3870 }, { "epoch": 351.969696969697, "grad_norm": 0.32695403695106506, "learning_rate": 1.129e-05, "loss": 0.3688, "step": 3871 }, { "epoch": 352.0, "grad_norm": 0.5059196352958679, "learning_rate": 1.128e-05, "loss": 0.2973, "step": 3872 }, { "epoch": 352.0969696969697, "grad_norm": 0.31515827775001526, "learning_rate": 1.127e-05, "loss": 0.3277, "step": 3873 }, { "epoch": 352.1939393939394, "grad_norm": 0.3012741208076477, "learning_rate": 1.126e-05, "loss": 0.3339, "step": 3874 }, { "epoch": 352.2909090909091, "grad_norm": 0.3248525857925415, "learning_rate": 1.125e-05, "loss": 0.3321, "step": 3875 }, { "epoch": 352.3878787878788, "grad_norm": 0.30005964636802673, "learning_rate": 1.124e-05, "loss": 0.3685, "step": 3876 }, { "epoch": 352.4848484848485, "grad_norm": 0.334306001663208, "learning_rate": 1.1230000000000001e-05, "loss": 0.3317, "step": 3877 }, { "epoch": 352.58181818181816, "grad_norm": 0.2881486415863037, "learning_rate": 1.122e-05, "loss": 0.3174, "step": 3878 }, { "epoch": 352.6787878787879, "grad_norm": 0.2741554379463196, "learning_rate": 1.1210000000000001e-05, "loss": 0.3515, "step": 3879 }, { "epoch": 352.77575757575755, "grad_norm": 0.2765088677406311, "learning_rate": 1.1200000000000001e-05, "loss": 0.3364, "step": 3880 }, { "epoch": 352.77575757575755, "eval_loss": 0.42547720670700073, "eval_runtime": 2.1326, "eval_samples_per_second": 25.79, "eval_steps_per_second": 3.282, "step": 3880 }, { "epoch": 352.8727272727273, "grad_norm": 0.21953384578227997, "learning_rate": 1.1190000000000001e-05, "loss": 0.3198, "step": 3881 }, { "epoch": 352.969696969697, "grad_norm": 0.32624542713165283, "learning_rate": 1.118e-05, "loss": 0.3275, "step": 3882 }, { "epoch": 353.0, "grad_norm": 0.4760756492614746, "learning_rate": 1.117e-05, "loss": 0.2851, "step": 3883 }, { "epoch": 353.0969696969697, "grad_norm": 0.26394760608673096, "learning_rate": 1.1160000000000002e-05, "loss": 0.2947, "step": 3884 }, { "epoch": 353.1939393939394, "grad_norm": 0.31982186436653137, "learning_rate": 1.115e-05, "loss": 0.3338, "step": 3885 }, { "epoch": 353.2909090909091, "grad_norm": 0.2912452220916748, "learning_rate": 1.114e-05, "loss": 0.3035, "step": 3886 }, { "epoch": 353.3878787878788, "grad_norm": 0.28226861357688904, "learning_rate": 1.113e-05, "loss": 0.328, "step": 3887 }, { "epoch": 353.4848484848485, "grad_norm": 0.2831850051879883, "learning_rate": 1.112e-05, "loss": 0.3714, "step": 3888 }, { "epoch": 353.58181818181816, "grad_norm": 0.292229562997818, "learning_rate": 1.111e-05, "loss": 0.3491, "step": 3889 }, { "epoch": 353.6787878787879, "grad_norm": 0.2777029573917389, "learning_rate": 1.11e-05, "loss": 0.3499, "step": 3890 }, { "epoch": 353.6787878787879, "eval_loss": 0.42573267221450806, "eval_runtime": 2.1309, "eval_samples_per_second": 25.811, "eval_steps_per_second": 3.285, "step": 3890 }, { "epoch": 353.77575757575755, "grad_norm": 0.32899153232574463, "learning_rate": 1.109e-05, "loss": 0.3612, "step": 3891 }, { "epoch": 353.8727272727273, "grad_norm": 0.2697356045246124, "learning_rate": 1.108e-05, "loss": 0.3014, "step": 3892 }, { "epoch": 353.969696969697, "grad_norm": 0.26077619194984436, "learning_rate": 1.107e-05, "loss": 0.3224, "step": 3893 }, { "epoch": 354.0, "grad_norm": 0.5170732736587524, "learning_rate": 1.106e-05, "loss": 0.3688, "step": 3894 }, { "epoch": 354.0969696969697, "grad_norm": 0.2870286703109741, "learning_rate": 1.1050000000000001e-05, "loss": 0.3059, "step": 3895 }, { "epoch": 354.1939393939394, "grad_norm": 0.28191274404525757, "learning_rate": 1.1040000000000001e-05, "loss": 0.3329, "step": 3896 }, { "epoch": 354.2909090909091, "grad_norm": 0.2566986382007599, "learning_rate": 1.103e-05, "loss": 0.3068, "step": 3897 }, { "epoch": 354.3878787878788, "grad_norm": 0.2678109109401703, "learning_rate": 1.1020000000000001e-05, "loss": 0.317, "step": 3898 }, { "epoch": 354.4848484848485, "grad_norm": 0.29949188232421875, "learning_rate": 1.1010000000000001e-05, "loss": 0.34, "step": 3899 }, { "epoch": 354.58181818181816, "grad_norm": 0.330269455909729, "learning_rate": 1.1000000000000001e-05, "loss": 0.3302, "step": 3900 }, { "epoch": 354.58181818181816, "eval_loss": 0.4255954623222351, "eval_runtime": 2.1491, "eval_samples_per_second": 25.592, "eval_steps_per_second": 3.257, "step": 3900 }, { "epoch": 354.6787878787879, "grad_norm": 0.30191609263420105, "learning_rate": 1.099e-05, "loss": 0.3181, "step": 3901 }, { "epoch": 354.77575757575755, "grad_norm": 0.2983716130256653, "learning_rate": 1.098e-05, "loss": 0.3562, "step": 3902 }, { "epoch": 354.8727272727273, "grad_norm": 0.329935222864151, "learning_rate": 1.0970000000000002e-05, "loss": 0.3566, "step": 3903 }, { "epoch": 354.969696969697, "grad_norm": 0.30771416425704956, "learning_rate": 1.096e-05, "loss": 0.3578, "step": 3904 }, { "epoch": 355.0, "grad_norm": 0.46148690581321716, "learning_rate": 1.095e-05, "loss": 0.3558, "step": 3905 }, { "epoch": 355.0969696969697, "grad_norm": 0.30684080719947815, "learning_rate": 1.094e-05, "loss": 0.3014, "step": 3906 }, { "epoch": 355.1939393939394, "grad_norm": 0.26615822315216064, "learning_rate": 1.093e-05, "loss": 0.3163, "step": 3907 }, { "epoch": 355.2909090909091, "grad_norm": 0.2488378882408142, "learning_rate": 1.092e-05, "loss": 0.3514, "step": 3908 }, { "epoch": 355.3878787878788, "grad_norm": 0.29284900426864624, "learning_rate": 1.091e-05, "loss": 0.3522, "step": 3909 }, { "epoch": 355.4848484848485, "grad_norm": 0.2902972102165222, "learning_rate": 1.09e-05, "loss": 0.3437, "step": 3910 }, { "epoch": 355.4848484848485, "eval_loss": 0.42546331882476807, "eval_runtime": 2.1237, "eval_samples_per_second": 25.898, "eval_steps_per_second": 3.296, "step": 3910 }, { "epoch": 355.58181818181816, "grad_norm": 0.3538964092731476, "learning_rate": 1.089e-05, "loss": 0.3273, "step": 3911 }, { "epoch": 355.6787878787879, "grad_norm": 0.3419858515262604, "learning_rate": 1.088e-05, "loss": 0.3456, "step": 3912 }, { "epoch": 355.77575757575755, "grad_norm": 0.310893177986145, "learning_rate": 1.0870000000000001e-05, "loss": 0.3529, "step": 3913 }, { "epoch": 355.8727272727273, "grad_norm": 0.2636914551258087, "learning_rate": 1.0860000000000001e-05, "loss": 0.3095, "step": 3914 }, { "epoch": 355.969696969697, "grad_norm": 0.2791852653026581, "learning_rate": 1.0850000000000001e-05, "loss": 0.3287, "step": 3915 }, { "epoch": 356.0, "grad_norm": 0.4313600957393646, "learning_rate": 1.084e-05, "loss": 0.3258, "step": 3916 }, { "epoch": 356.0969696969697, "grad_norm": 0.3125990629196167, "learning_rate": 1.083e-05, "loss": 0.3627, "step": 3917 }, { "epoch": 356.1939393939394, "grad_norm": 0.28080224990844727, "learning_rate": 1.0820000000000001e-05, "loss": 0.348, "step": 3918 }, { "epoch": 356.2909090909091, "grad_norm": 0.27683061361312866, "learning_rate": 1.081e-05, "loss": 0.3389, "step": 3919 }, { "epoch": 356.3878787878788, "grad_norm": 0.509411633014679, "learning_rate": 1.08e-05, "loss": 0.3431, "step": 3920 }, { "epoch": 356.3878787878788, "eval_loss": 0.42507705092430115, "eval_runtime": 2.1365, "eval_samples_per_second": 25.744, "eval_steps_per_second": 3.276, "step": 3920 }, { "epoch": 356.4848484848485, "grad_norm": 0.2736632227897644, "learning_rate": 1.079e-05, "loss": 0.3336, "step": 3921 }, { "epoch": 356.58181818181816, "grad_norm": 0.27927494049072266, "learning_rate": 1.0780000000000002e-05, "loss": 0.3236, "step": 3922 }, { "epoch": 356.6787878787879, "grad_norm": 0.23492373526096344, "learning_rate": 1.077e-05, "loss": 0.3135, "step": 3923 }, { "epoch": 356.77575757575755, "grad_norm": 0.33352339267730713, "learning_rate": 1.076e-05, "loss": 0.2849, "step": 3924 }, { "epoch": 356.8727272727273, "grad_norm": 0.2793315649032593, "learning_rate": 1.075e-05, "loss": 0.3611, "step": 3925 }, { "epoch": 356.969696969697, "grad_norm": 0.29622992873191833, "learning_rate": 1.074e-05, "loss": 0.3215, "step": 3926 }, { "epoch": 357.0, "grad_norm": 0.4465533196926117, "learning_rate": 1.073e-05, "loss": 0.3219, "step": 3927 }, { "epoch": 357.0969696969697, "grad_norm": 0.26654234528541565, "learning_rate": 1.072e-05, "loss": 0.3282, "step": 3928 }, { "epoch": 357.1939393939394, "grad_norm": 0.3026706278324127, "learning_rate": 1.071e-05, "loss": 0.327, "step": 3929 }, { "epoch": 357.2909090909091, "grad_norm": 0.2650451362133026, "learning_rate": 1.0700000000000001e-05, "loss": 0.3291, "step": 3930 }, { "epoch": 357.2909090909091, "eval_loss": 0.42528554797172546, "eval_runtime": 2.1199, "eval_samples_per_second": 25.945, "eval_steps_per_second": 3.302, "step": 3930 }, { "epoch": 357.3878787878788, "grad_norm": 0.26152563095092773, "learning_rate": 1.069e-05, "loss": 0.3328, "step": 3931 }, { "epoch": 357.4848484848485, "grad_norm": 0.30806395411491394, "learning_rate": 1.0680000000000001e-05, "loss": 0.3217, "step": 3932 }, { "epoch": 357.58181818181816, "grad_norm": 0.2408694475889206, "learning_rate": 1.0670000000000001e-05, "loss": 0.3296, "step": 3933 }, { "epoch": 357.6787878787879, "grad_norm": 0.35017138719558716, "learning_rate": 1.0660000000000001e-05, "loss": 0.3301, "step": 3934 }, { "epoch": 357.77575757575755, "grad_norm": 0.26608020067214966, "learning_rate": 1.065e-05, "loss": 0.316, "step": 3935 }, { "epoch": 357.8727272727273, "grad_norm": 0.3791072964668274, "learning_rate": 1.064e-05, "loss": 0.3598, "step": 3936 }, { "epoch": 357.969696969697, "grad_norm": 0.2872272729873657, "learning_rate": 1.0630000000000002e-05, "loss": 0.3669, "step": 3937 }, { "epoch": 358.0, "grad_norm": 0.5474063754081726, "learning_rate": 1.062e-05, "loss": 0.2731, "step": 3938 }, { "epoch": 358.0969696969697, "grad_norm": 0.30581510066986084, "learning_rate": 1.061e-05, "loss": 0.3275, "step": 3939 }, { "epoch": 358.1939393939394, "grad_norm": 0.3368642032146454, "learning_rate": 1.06e-05, "loss": 0.367, "step": 3940 }, { "epoch": 358.1939393939394, "eval_loss": 0.42523324489593506, "eval_runtime": 2.1338, "eval_samples_per_second": 25.776, "eval_steps_per_second": 3.281, "step": 3940 }, { "epoch": 358.2909090909091, "grad_norm": 0.28304001688957214, "learning_rate": 1.059e-05, "loss": 0.3377, "step": 3941 }, { "epoch": 358.3878787878788, "grad_norm": 0.22714465856552124, "learning_rate": 1.058e-05, "loss": 0.3298, "step": 3942 }, { "epoch": 358.4848484848485, "grad_norm": 0.30259737372398376, "learning_rate": 1.057e-05, "loss": 0.3283, "step": 3943 }, { "epoch": 358.58181818181816, "grad_norm": 0.2771393954753876, "learning_rate": 1.056e-05, "loss": 0.3162, "step": 3944 }, { "epoch": 358.6787878787879, "grad_norm": 0.2818312644958496, "learning_rate": 1.055e-05, "loss": 0.344, "step": 3945 }, { "epoch": 358.77575757575755, "grad_norm": 0.26231974363327026, "learning_rate": 1.0539999999999999e-05, "loss": 0.337, "step": 3946 }, { "epoch": 358.8727272727273, "grad_norm": 0.34645920991897583, "learning_rate": 1.053e-05, "loss": 0.3291, "step": 3947 }, { "epoch": 358.969696969697, "grad_norm": 0.29468825459480286, "learning_rate": 1.0520000000000001e-05, "loss": 0.2825, "step": 3948 }, { "epoch": 359.0, "grad_norm": 0.5619099736213684, "learning_rate": 1.0510000000000001e-05, "loss": 0.3934, "step": 3949 }, { "epoch": 359.0969696969697, "grad_norm": 0.258922815322876, "learning_rate": 1.05e-05, "loss": 0.3313, "step": 3950 }, { "epoch": 359.0969696969697, "eval_loss": 0.4250088334083557, "eval_runtime": 2.1484, "eval_samples_per_second": 25.6, "eval_steps_per_second": 3.258, "step": 3950 }, { "epoch": 359.1939393939394, "grad_norm": 0.4014686644077301, "learning_rate": 1.049e-05, "loss": 0.3434, "step": 3951 }, { "epoch": 359.2909090909091, "grad_norm": 0.270025372505188, "learning_rate": 1.0480000000000001e-05, "loss": 0.2918, "step": 3952 }, { "epoch": 359.3878787878788, "grad_norm": 0.31015175580978394, "learning_rate": 1.0470000000000001e-05, "loss": 0.3407, "step": 3953 }, { "epoch": 359.4848484848485, "grad_norm": 0.26703396439552307, "learning_rate": 1.046e-05, "loss": 0.3264, "step": 3954 }, { "epoch": 359.58181818181816, "grad_norm": 0.25800076127052307, "learning_rate": 1.045e-05, "loss": 0.3408, "step": 3955 }, { "epoch": 359.6787878787879, "grad_norm": 0.2913641631603241, "learning_rate": 1.0440000000000002e-05, "loss": 0.3562, "step": 3956 }, { "epoch": 359.77575757575755, "grad_norm": 0.350644052028656, "learning_rate": 1.043e-05, "loss": 0.3556, "step": 3957 }, { "epoch": 359.8727272727273, "grad_norm": 0.2973456084728241, "learning_rate": 1.042e-05, "loss": 0.3085, "step": 3958 }, { "epoch": 359.969696969697, "grad_norm": 0.2788713276386261, "learning_rate": 1.041e-05, "loss": 0.3227, "step": 3959 }, { "epoch": 360.0, "grad_norm": 0.42132866382598877, "learning_rate": 1.04e-05, "loss": 0.3393, "step": 3960 }, { "epoch": 360.0, "eval_loss": 0.42503562569618225, "eval_runtime": 2.1239, "eval_samples_per_second": 25.895, "eval_steps_per_second": 3.296, "step": 3960 }, { "epoch": 360.0969696969697, "grad_norm": 0.2816360890865326, "learning_rate": 1.039e-05, "loss": 0.2806, "step": 3961 }, { "epoch": 360.1939393939394, "grad_norm": 0.26682475209236145, "learning_rate": 1.038e-05, "loss": 0.3402, "step": 3962 }, { "epoch": 360.2909090909091, "grad_norm": 0.2514760494232178, "learning_rate": 1.037e-05, "loss": 0.3163, "step": 3963 }, { "epoch": 360.3878787878788, "grad_norm": 0.299314022064209, "learning_rate": 1.036e-05, "loss": 0.3187, "step": 3964 }, { "epoch": 360.4848484848485, "grad_norm": 0.2835948169231415, "learning_rate": 1.035e-05, "loss": 0.304, "step": 3965 }, { "epoch": 360.58181818181816, "grad_norm": 0.2949691116809845, "learning_rate": 1.0340000000000001e-05, "loss": 0.3801, "step": 3966 }, { "epoch": 360.6787878787879, "grad_norm": 0.30468055605888367, "learning_rate": 1.0330000000000001e-05, "loss": 0.3158, "step": 3967 }, { "epoch": 360.77575757575755, "grad_norm": 0.3264327347278595, "learning_rate": 1.0320000000000001e-05, "loss": 0.3169, "step": 3968 }, { "epoch": 360.8727272727273, "grad_norm": 0.3753027617931366, "learning_rate": 1.031e-05, "loss": 0.3805, "step": 3969 }, { "epoch": 360.969696969697, "grad_norm": 0.31869491934776306, "learning_rate": 1.03e-05, "loss": 0.3675, "step": 3970 }, { "epoch": 360.969696969697, "eval_loss": 0.4247276782989502, "eval_runtime": 2.1286, "eval_samples_per_second": 25.838, "eval_steps_per_second": 3.288, "step": 3970 }, { "epoch": 361.0, "grad_norm": 0.547243595123291, "learning_rate": 1.0290000000000001e-05, "loss": 0.3337, "step": 3971 }, { "epoch": 361.0969696969697, "grad_norm": 0.290824294090271, "learning_rate": 1.0280000000000002e-05, "loss": 0.3522, "step": 3972 }, { "epoch": 361.1939393939394, "grad_norm": 0.24751658737659454, "learning_rate": 1.027e-05, "loss": 0.3332, "step": 3973 }, { "epoch": 361.2909090909091, "grad_norm": 0.2930498421192169, "learning_rate": 1.026e-05, "loss": 0.3102, "step": 3974 }, { "epoch": 361.3878787878788, "grad_norm": 0.28421491384506226, "learning_rate": 1.025e-05, "loss": 0.3381, "step": 3975 }, { "epoch": 361.4848484848485, "grad_norm": 0.31874242424964905, "learning_rate": 1.024e-05, "loss": 0.3196, "step": 3976 }, { "epoch": 361.58181818181816, "grad_norm": 0.3144734501838684, "learning_rate": 1.023e-05, "loss": 0.3567, "step": 3977 }, { "epoch": 361.6787878787879, "grad_norm": 0.27309650182724, "learning_rate": 1.022e-05, "loss": 0.3337, "step": 3978 }, { "epoch": 361.77575757575755, "grad_norm": 0.35673069953918457, "learning_rate": 1.021e-05, "loss": 0.3244, "step": 3979 }, { "epoch": 361.8727272727273, "grad_norm": 0.28269463777542114, "learning_rate": 1.02e-05, "loss": 0.3315, "step": 3980 }, { "epoch": 361.8727272727273, "eval_loss": 0.4251295030117035, "eval_runtime": 2.116, "eval_samples_per_second": 25.992, "eval_steps_per_second": 3.308, "step": 3980 }, { "epoch": 361.969696969697, "grad_norm": 0.26234710216522217, "learning_rate": 1.019e-05, "loss": 0.323, "step": 3981 }, { "epoch": 362.0, "grad_norm": 0.4822295010089874, "learning_rate": 1.018e-05, "loss": 0.3195, "step": 3982 }, { "epoch": 362.0969696969697, "grad_norm": 0.2835442125797272, "learning_rate": 1.0170000000000001e-05, "loss": 0.3206, "step": 3983 }, { "epoch": 362.1939393939394, "grad_norm": 0.3007937967777252, "learning_rate": 1.016e-05, "loss": 0.3322, "step": 3984 }, { "epoch": 362.2909090909091, "grad_norm": 0.27044838666915894, "learning_rate": 1.0150000000000001e-05, "loss": 0.3168, "step": 3985 }, { "epoch": 362.3878787878788, "grad_norm": 0.27905598282814026, "learning_rate": 1.0140000000000001e-05, "loss": 0.3209, "step": 3986 }, { "epoch": 362.4848484848485, "grad_norm": 0.2970101535320282, "learning_rate": 1.0130000000000001e-05, "loss": 0.3481, "step": 3987 }, { "epoch": 362.58181818181816, "grad_norm": 0.27853038907051086, "learning_rate": 1.012e-05, "loss": 0.3228, "step": 3988 }, { "epoch": 362.6787878787879, "grad_norm": 0.32778528332710266, "learning_rate": 1.011e-05, "loss": 0.3385, "step": 3989 }, { "epoch": 362.77575757575755, "grad_norm": 0.3114234507083893, "learning_rate": 1.0100000000000002e-05, "loss": 0.3553, "step": 3990 }, { "epoch": 362.77575757575755, "eval_loss": 0.4250686764717102, "eval_runtime": 2.1338, "eval_samples_per_second": 25.775, "eval_steps_per_second": 3.28, "step": 3990 }, { "epoch": 362.8727272727273, "grad_norm": 0.2816259264945984, "learning_rate": 1.0090000000000002e-05, "loss": 0.3652, "step": 3991 }, { "epoch": 362.969696969697, "grad_norm": 0.2591400146484375, "learning_rate": 1.008e-05, "loss": 0.3156, "step": 3992 }, { "epoch": 363.0, "grad_norm": 0.47758832573890686, "learning_rate": 1.007e-05, "loss": 0.2642, "step": 3993 }, { "epoch": 363.0969696969697, "grad_norm": 0.33999863266944885, "learning_rate": 1.006e-05, "loss": 0.2949, "step": 3994 }, { "epoch": 363.1939393939394, "grad_norm": 0.2757456302642822, "learning_rate": 1.005e-05, "loss": 0.3314, "step": 3995 }, { "epoch": 363.2909090909091, "grad_norm": 0.2781667113304138, "learning_rate": 1.004e-05, "loss": 0.3512, "step": 3996 }, { "epoch": 363.3878787878788, "grad_norm": 0.30516746640205383, "learning_rate": 1.003e-05, "loss": 0.3407, "step": 3997 }, { "epoch": 363.4848484848485, "grad_norm": 0.3394595682621002, "learning_rate": 1.002e-05, "loss": 0.3415, "step": 3998 }, { "epoch": 363.58181818181816, "grad_norm": 0.3109613060951233, "learning_rate": 1.001e-05, "loss": 0.324, "step": 3999 }, { "epoch": 363.6787878787879, "grad_norm": 0.2772013247013092, "learning_rate": 1e-05, "loss": 0.3119, "step": 4000 }, { "epoch": 363.6787878787879, "eval_loss": 0.4250546395778656, "eval_runtime": 2.1465, "eval_samples_per_second": 25.623, "eval_steps_per_second": 3.261, "step": 4000 }, { "epoch": 363.77575757575755, "grad_norm": 0.31099388003349304, "learning_rate": 9.990000000000001e-06, "loss": 0.3787, "step": 4001 }, { "epoch": 363.8727272727273, "grad_norm": 0.29429104924201965, "learning_rate": 9.980000000000001e-06, "loss": 0.3431, "step": 4002 }, { "epoch": 363.969696969697, "grad_norm": 0.27421092987060547, "learning_rate": 9.97e-06, "loss": 0.2933, "step": 4003 }, { "epoch": 364.0, "grad_norm": 0.4427415430545807, "learning_rate": 9.96e-06, "loss": 0.3564, "step": 4004 }, { "epoch": 364.0969696969697, "grad_norm": 0.2868616580963135, "learning_rate": 9.950000000000001e-06, "loss": 0.3189, "step": 4005 }, { "epoch": 364.1939393939394, "grad_norm": 0.30533933639526367, "learning_rate": 9.940000000000001e-06, "loss": 0.3299, "step": 4006 }, { "epoch": 364.2909090909091, "grad_norm": 0.2742302417755127, "learning_rate": 9.93e-06, "loss": 0.3206, "step": 4007 }, { "epoch": 364.3878787878788, "grad_norm": 0.29955291748046875, "learning_rate": 9.92e-06, "loss": 0.3458, "step": 4008 }, { "epoch": 364.4848484848485, "grad_norm": 0.24295401573181152, "learning_rate": 9.91e-06, "loss": 0.3426, "step": 4009 }, { "epoch": 364.58181818181816, "grad_norm": 0.2764863669872284, "learning_rate": 9.900000000000002e-06, "loss": 0.3547, "step": 4010 }, { "epoch": 364.58181818181816, "eval_loss": 0.4247507154941559, "eval_runtime": 2.1081, "eval_samples_per_second": 26.09, "eval_steps_per_second": 3.321, "step": 4010 }, { "epoch": 364.6787878787879, "grad_norm": 0.28245973587036133, "learning_rate": 9.89e-06, "loss": 0.3362, "step": 4011 }, { "epoch": 364.77575757575755, "grad_norm": 0.33747348189353943, "learning_rate": 9.88e-06, "loss": 0.3207, "step": 4012 }, { "epoch": 364.8727272727273, "grad_norm": 0.3213714361190796, "learning_rate": 9.87e-06, "loss": 0.3217, "step": 4013 }, { "epoch": 364.969696969697, "grad_norm": 0.2978791892528534, "learning_rate": 9.86e-06, "loss": 0.3275, "step": 4014 }, { "epoch": 365.0, "grad_norm": 0.5955439805984497, "learning_rate": 9.85e-06, "loss": 0.3338, "step": 4015 }, { "epoch": 365.0969696969697, "grad_norm": 0.25707629323005676, "learning_rate": 9.84e-06, "loss": 0.3284, "step": 4016 }, { "epoch": 365.1939393939394, "grad_norm": 0.33836591243743896, "learning_rate": 9.83e-06, "loss": 0.3375, "step": 4017 }, { "epoch": 365.2909090909091, "grad_norm": 0.2548236846923828, "learning_rate": 9.820000000000001e-06, "loss": 0.329, "step": 4018 }, { "epoch": 365.3878787878788, "grad_norm": 0.30328866839408875, "learning_rate": 9.810000000000001e-06, "loss": 0.3284, "step": 4019 }, { "epoch": 365.4848484848485, "grad_norm": 0.29344213008880615, "learning_rate": 9.800000000000001e-06, "loss": 0.3367, "step": 4020 }, { "epoch": 365.4848484848485, "eval_loss": 0.4250096082687378, "eval_runtime": 2.1186, "eval_samples_per_second": 25.961, "eval_steps_per_second": 3.304, "step": 4020 }, { "epoch": 365.58181818181816, "grad_norm": 0.29405832290649414, "learning_rate": 9.790000000000001e-06, "loss": 0.3655, "step": 4021 }, { "epoch": 365.6787878787879, "grad_norm": 0.31418031454086304, "learning_rate": 9.78e-06, "loss": 0.3249, "step": 4022 }, { "epoch": 365.77575757575755, "grad_norm": 0.3338219225406647, "learning_rate": 9.77e-06, "loss": 0.2812, "step": 4023 }, { "epoch": 365.8727272727273, "grad_norm": 0.26671719551086426, "learning_rate": 9.760000000000001e-06, "loss": 0.3499, "step": 4024 }, { "epoch": 365.969696969697, "grad_norm": 0.32956159114837646, "learning_rate": 9.750000000000002e-06, "loss": 0.3476, "step": 4025 }, { "epoch": 366.0, "grad_norm": 0.4222256541252136, "learning_rate": 9.74e-06, "loss": 0.2755, "step": 4026 }, { "epoch": 366.0969696969697, "grad_norm": 0.27772313356399536, "learning_rate": 9.73e-06, "loss": 0.3148, "step": 4027 }, { "epoch": 366.1939393939394, "grad_norm": 0.32751691341400146, "learning_rate": 9.72e-06, "loss": 0.3628, "step": 4028 }, { "epoch": 366.2909090909091, "grad_norm": 0.3402860462665558, "learning_rate": 9.71e-06, "loss": 0.3789, "step": 4029 }, { "epoch": 366.3878787878788, "grad_norm": 0.2285175919532776, "learning_rate": 9.7e-06, "loss": 0.3331, "step": 4030 }, { "epoch": 366.3878787878788, "eval_loss": 0.42465740442276, "eval_runtime": 2.1169, "eval_samples_per_second": 25.981, "eval_steps_per_second": 3.307, "step": 4030 }, { "epoch": 366.4848484848485, "grad_norm": 0.26843827962875366, "learning_rate": 9.69e-06, "loss": 0.3265, "step": 4031 }, { "epoch": 366.58181818181816, "grad_norm": 0.36523428559303284, "learning_rate": 9.68e-06, "loss": 0.2867, "step": 4032 }, { "epoch": 366.6787878787879, "grad_norm": 0.2948589324951172, "learning_rate": 9.67e-06, "loss": 0.3547, "step": 4033 }, { "epoch": 366.77575757575755, "grad_norm": 0.2663814127445221, "learning_rate": 9.66e-06, "loss": 0.2983, "step": 4034 }, { "epoch": 366.8727272727273, "grad_norm": 0.4233630299568176, "learning_rate": 9.65e-06, "loss": 0.3001, "step": 4035 }, { "epoch": 366.969696969697, "grad_norm": 0.2976456582546234, "learning_rate": 9.640000000000001e-06, "loss": 0.3356, "step": 4036 }, { "epoch": 367.0, "grad_norm": 0.5265571475028992, "learning_rate": 9.630000000000001e-06, "loss": 0.3957, "step": 4037 }, { "epoch": 367.0969696969697, "grad_norm": 0.269848108291626, "learning_rate": 9.62e-06, "loss": 0.349, "step": 4038 }, { "epoch": 367.1939393939394, "grad_norm": 0.3301657438278198, "learning_rate": 9.610000000000001e-06, "loss": 0.3184, "step": 4039 }, { "epoch": 367.2909090909091, "grad_norm": 0.3008776307106018, "learning_rate": 9.600000000000001e-06, "loss": 0.3579, "step": 4040 }, { "epoch": 367.2909090909091, "eval_loss": 0.4249913692474365, "eval_runtime": 2.1317, "eval_samples_per_second": 25.801, "eval_steps_per_second": 3.284, "step": 4040 }, { "epoch": 367.3878787878788, "grad_norm": 0.27042990922927856, "learning_rate": 9.59e-06, "loss": 0.332, "step": 4041 }, { "epoch": 367.4848484848485, "grad_norm": 0.2910686731338501, "learning_rate": 9.58e-06, "loss": 0.306, "step": 4042 }, { "epoch": 367.58181818181816, "grad_norm": 0.32154032588005066, "learning_rate": 9.57e-06, "loss": 0.3121, "step": 4043 }, { "epoch": 367.6787878787879, "grad_norm": 0.303244948387146, "learning_rate": 9.560000000000002e-06, "loss": 0.3221, "step": 4044 }, { "epoch": 367.77575757575755, "grad_norm": 0.292343407869339, "learning_rate": 9.55e-06, "loss": 0.375, "step": 4045 }, { "epoch": 367.8727272727273, "grad_norm": 0.28821298480033875, "learning_rate": 9.54e-06, "loss": 0.3198, "step": 4046 }, { "epoch": 367.969696969697, "grad_norm": 0.3002316653728485, "learning_rate": 9.53e-06, "loss": 0.3319, "step": 4047 }, { "epoch": 368.0, "grad_norm": 0.4897102117538452, "learning_rate": 9.52e-06, "loss": 0.3124, "step": 4048 }, { "epoch": 368.0969696969697, "grad_norm": 0.3233930170536041, "learning_rate": 9.51e-06, "loss": 0.3134, "step": 4049 }, { "epoch": 368.1939393939394, "grad_norm": 0.2641765773296356, "learning_rate": 9.5e-06, "loss": 0.334, "step": 4050 }, { "epoch": 368.1939393939394, "eval_loss": 0.42491304874420166, "eval_runtime": 2.126, "eval_samples_per_second": 25.87, "eval_steps_per_second": 3.292, "step": 4050 }, { "epoch": 368.2909090909091, "grad_norm": 0.27830106019973755, "learning_rate": 9.49e-06, "loss": 0.3238, "step": 4051 }, { "epoch": 368.3878787878788, "grad_norm": 0.25568294525146484, "learning_rate": 9.48e-06, "loss": 0.3427, "step": 4052 }, { "epoch": 368.4848484848485, "grad_norm": 0.3005104660987854, "learning_rate": 9.47e-06, "loss": 0.2992, "step": 4053 }, { "epoch": 368.58181818181816, "grad_norm": 0.2774483263492584, "learning_rate": 9.460000000000001e-06, "loss": 0.3246, "step": 4054 }, { "epoch": 368.6787878787879, "grad_norm": 0.32644858956336975, "learning_rate": 9.450000000000001e-06, "loss": 0.3247, "step": 4055 }, { "epoch": 368.77575757575755, "grad_norm": 0.31656840443611145, "learning_rate": 9.44e-06, "loss": 0.3473, "step": 4056 }, { "epoch": 368.8727272727273, "grad_norm": 0.2856542766094208, "learning_rate": 9.43e-06, "loss": 0.3635, "step": 4057 }, { "epoch": 368.969696969697, "grad_norm": 0.25360551476478577, "learning_rate": 9.420000000000001e-06, "loss": 0.3494, "step": 4058 }, { "epoch": 369.0, "grad_norm": 0.5523640513420105, "learning_rate": 9.410000000000001e-06, "loss": 0.2984, "step": 4059 }, { "epoch": 369.0969696969697, "grad_norm": 0.26677101850509644, "learning_rate": 9.4e-06, "loss": 0.3447, "step": 4060 }, { "epoch": 369.0969696969697, "eval_loss": 0.42472049593925476, "eval_runtime": 2.1255, "eval_samples_per_second": 25.876, "eval_steps_per_second": 3.293, "step": 4060 }, { "epoch": 369.1939393939394, "grad_norm": 0.279538631439209, "learning_rate": 9.39e-06, "loss": 0.307, "step": 4061 }, { "epoch": 369.2909090909091, "grad_norm": 0.30089130997657776, "learning_rate": 9.38e-06, "loss": 0.3473, "step": 4062 }, { "epoch": 369.3878787878788, "grad_norm": 0.2933594286441803, "learning_rate": 9.370000000000002e-06, "loss": 0.3001, "step": 4063 }, { "epoch": 369.4848484848485, "grad_norm": 0.266652911901474, "learning_rate": 9.36e-06, "loss": 0.3355, "step": 4064 }, { "epoch": 369.58181818181816, "grad_norm": 0.25464680790901184, "learning_rate": 9.35e-06, "loss": 0.3101, "step": 4065 }, { "epoch": 369.6787878787879, "grad_norm": 0.2997622489929199, "learning_rate": 9.34e-06, "loss": 0.3191, "step": 4066 }, { "epoch": 369.77575757575755, "grad_norm": 0.3323077857494354, "learning_rate": 9.33e-06, "loss": 0.3544, "step": 4067 }, { "epoch": 369.8727272727273, "grad_norm": 0.29756712913513184, "learning_rate": 9.32e-06, "loss": 0.3342, "step": 4068 }, { "epoch": 369.969696969697, "grad_norm": 0.2877776026725769, "learning_rate": 9.31e-06, "loss": 0.3577, "step": 4069 }, { "epoch": 370.0, "grad_norm": 0.5464515686035156, "learning_rate": 9.3e-06, "loss": 0.328, "step": 4070 }, { "epoch": 370.0, "eval_loss": 0.4247572720050812, "eval_runtime": 2.1217, "eval_samples_per_second": 25.922, "eval_steps_per_second": 3.299, "step": 4070 }, { "epoch": 370.0969696969697, "grad_norm": 0.2983577847480774, "learning_rate": 9.29e-06, "loss": 0.3391, "step": 4071 }, { "epoch": 370.1939393939394, "grad_norm": 0.2986522614955902, "learning_rate": 9.28e-06, "loss": 0.3325, "step": 4072 }, { "epoch": 370.2909090909091, "grad_norm": 0.27389615774154663, "learning_rate": 9.270000000000001e-06, "loss": 0.3639, "step": 4073 }, { "epoch": 370.3878787878788, "grad_norm": 0.2886711061000824, "learning_rate": 9.260000000000001e-06, "loss": 0.3264, "step": 4074 }, { "epoch": 370.4848484848485, "grad_norm": 0.30563750863075256, "learning_rate": 9.25e-06, "loss": 0.3075, "step": 4075 }, { "epoch": 370.58181818181816, "grad_norm": 0.3458840548992157, "learning_rate": 9.24e-06, "loss": 0.3191, "step": 4076 }, { "epoch": 370.6787878787879, "grad_norm": 0.4566315710544586, "learning_rate": 9.23e-06, "loss": 0.3237, "step": 4077 }, { "epoch": 370.77575757575755, "grad_norm": 0.298952579498291, "learning_rate": 9.220000000000002e-06, "loss": 0.3557, "step": 4078 }, { "epoch": 370.8727272727273, "grad_norm": 0.2589184045791626, "learning_rate": 9.21e-06, "loss": 0.3147, "step": 4079 }, { "epoch": 370.969696969697, "grad_norm": 0.3013869822025299, "learning_rate": 9.2e-06, "loss": 0.3389, "step": 4080 }, { "epoch": 370.969696969697, "eval_loss": 0.42480072379112244, "eval_runtime": 2.1089, "eval_samples_per_second": 26.079, "eval_steps_per_second": 3.319, "step": 4080 }, { "epoch": 371.0, "grad_norm": 0.5611611604690552, "learning_rate": 9.19e-06, "loss": 0.3006, "step": 4081 }, { "epoch": 371.0969696969697, "grad_norm": 0.28300800919532776, "learning_rate": 9.180000000000002e-06, "loss": 0.3446, "step": 4082 }, { "epoch": 371.1939393939394, "grad_norm": 0.2626231908798218, "learning_rate": 9.17e-06, "loss": 0.3481, "step": 4083 }, { "epoch": 371.2909090909091, "grad_norm": 0.2636497914791107, "learning_rate": 9.16e-06, "loss": 0.3083, "step": 4084 }, { "epoch": 371.3878787878788, "grad_norm": 0.3330760598182678, "learning_rate": 9.15e-06, "loss": 0.3171, "step": 4085 }, { "epoch": 371.4848484848485, "grad_norm": 0.25505438446998596, "learning_rate": 9.14e-06, "loss": 0.3434, "step": 4086 }, { "epoch": 371.58181818181816, "grad_norm": 0.3272264003753662, "learning_rate": 9.13e-06, "loss": 0.2918, "step": 4087 }, { "epoch": 371.6787878787879, "grad_norm": 0.30581533908843994, "learning_rate": 9.12e-06, "loss": 0.3406, "step": 4088 }, { "epoch": 371.77575757575755, "grad_norm": 0.28679990768432617, "learning_rate": 9.110000000000001e-06, "loss": 0.345, "step": 4089 }, { "epoch": 371.8727272727273, "grad_norm": 0.46133214235305786, "learning_rate": 9.100000000000001e-06, "loss": 0.3456, "step": 4090 }, { "epoch": 371.8727272727273, "eval_loss": 0.4248599708080292, "eval_runtime": 2.1296, "eval_samples_per_second": 25.826, "eval_steps_per_second": 3.287, "step": 4090 }, { "epoch": 371.969696969697, "grad_norm": 0.2853439748287201, "learning_rate": 9.09e-06, "loss": 0.3206, "step": 4091 }, { "epoch": 372.0, "grad_norm": 0.548027515411377, "learning_rate": 9.080000000000001e-06, "loss": 0.3337, "step": 4092 }, { "epoch": 372.0969696969697, "grad_norm": 0.2653498351573944, "learning_rate": 9.070000000000001e-06, "loss": 0.3608, "step": 4093 }, { "epoch": 372.1939393939394, "grad_norm": 0.27974510192871094, "learning_rate": 9.06e-06, "loss": 0.3155, "step": 4094 }, { "epoch": 372.2909090909091, "grad_norm": 0.26262444257736206, "learning_rate": 9.05e-06, "loss": 0.3314, "step": 4095 }, { "epoch": 372.3878787878788, "grad_norm": 0.2681049704551697, "learning_rate": 9.04e-06, "loss": 0.2963, "step": 4096 }, { "epoch": 372.4848484848485, "grad_norm": 0.3194270730018616, "learning_rate": 9.030000000000002e-06, "loss": 0.2966, "step": 4097 }, { "epoch": 372.58181818181816, "grad_norm": 0.29719874262809753, "learning_rate": 9.02e-06, "loss": 0.3365, "step": 4098 }, { "epoch": 372.6787878787879, "grad_norm": 0.2969561815261841, "learning_rate": 9.01e-06, "loss": 0.354, "step": 4099 }, { "epoch": 372.77575757575755, "grad_norm": 0.31575682759284973, "learning_rate": 9e-06, "loss": 0.3704, "step": 4100 }, { "epoch": 372.77575757575755, "eval_loss": 0.4247245192527771, "eval_runtime": 2.1174, "eval_samples_per_second": 25.976, "eval_steps_per_second": 3.306, "step": 4100 }, { "epoch": 372.8727272727273, "grad_norm": 0.3249724507331848, "learning_rate": 8.99e-06, "loss": 0.311, "step": 4101 }, { "epoch": 372.969696969697, "grad_norm": 0.285019189119339, "learning_rate": 8.98e-06, "loss": 0.3534, "step": 4102 }, { "epoch": 373.0, "grad_norm": 0.617175281047821, "learning_rate": 8.97e-06, "loss": 0.2668, "step": 4103 }, { "epoch": 373.0969696969697, "grad_norm": 0.3166021406650543, "learning_rate": 8.96e-06, "loss": 0.3533, "step": 4104 }, { "epoch": 373.1939393939394, "grad_norm": 0.3377653658390045, "learning_rate": 8.95e-06, "loss": 0.3174, "step": 4105 }, { "epoch": 373.2909090909091, "grad_norm": 0.2525980472564697, "learning_rate": 8.939999999999999e-06, "loss": 0.3036, "step": 4106 }, { "epoch": 373.3878787878788, "grad_norm": 0.30190375447273254, "learning_rate": 8.930000000000001e-06, "loss": 0.3661, "step": 4107 }, { "epoch": 373.4848484848485, "grad_norm": 0.32537418603897095, "learning_rate": 8.920000000000001e-06, "loss": 0.3355, "step": 4108 }, { "epoch": 373.58181818181816, "grad_norm": 0.27809938788414, "learning_rate": 8.910000000000001e-06, "loss": 0.3138, "step": 4109 }, { "epoch": 373.6787878787879, "grad_norm": 0.298795610666275, "learning_rate": 8.9e-06, "loss": 0.3375, "step": 4110 }, { "epoch": 373.6787878787879, "eval_loss": 0.4250388443470001, "eval_runtime": 2.1164, "eval_samples_per_second": 25.988, "eval_steps_per_second": 3.308, "step": 4110 }, { "epoch": 373.77575757575755, "grad_norm": 0.32072368264198303, "learning_rate": 8.890000000000001e-06, "loss": 0.3474, "step": 4111 }, { "epoch": 373.8727272727273, "grad_norm": 0.3273388743400574, "learning_rate": 8.880000000000001e-06, "loss": 0.3195, "step": 4112 }, { "epoch": 373.969696969697, "grad_norm": 0.3104739785194397, "learning_rate": 8.87e-06, "loss": 0.2922, "step": 4113 }, { "epoch": 374.0, "grad_norm": 0.4112694263458252, "learning_rate": 8.86e-06, "loss": 0.3919, "step": 4114 }, { "epoch": 374.0969696969697, "grad_norm": 0.2891893982887268, "learning_rate": 8.85e-06, "loss": 0.384, "step": 4115 }, { "epoch": 374.1939393939394, "grad_norm": 0.2470197081565857, "learning_rate": 8.840000000000002e-06, "loss": 0.3046, "step": 4116 }, { "epoch": 374.2909090909091, "grad_norm": 0.24681566655635834, "learning_rate": 8.83e-06, "loss": 0.3291, "step": 4117 }, { "epoch": 374.3878787878788, "grad_norm": 0.3531782627105713, "learning_rate": 8.82e-06, "loss": 0.3499, "step": 4118 }, { "epoch": 374.4848484848485, "grad_norm": 0.31936323642730713, "learning_rate": 8.81e-06, "loss": 0.31, "step": 4119 }, { "epoch": 374.58181818181816, "grad_norm": 0.2687115967273712, "learning_rate": 8.8e-06, "loss": 0.3571, "step": 4120 }, { "epoch": 374.58181818181816, "eval_loss": 0.42443034052848816, "eval_runtime": 2.1256, "eval_samples_per_second": 25.875, "eval_steps_per_second": 3.293, "step": 4120 }, { "epoch": 374.6787878787879, "grad_norm": 0.345988392829895, "learning_rate": 8.79e-06, "loss": 0.343, "step": 4121 }, { "epoch": 374.77575757575755, "grad_norm": 0.3371879458427429, "learning_rate": 8.78e-06, "loss": 0.3026, "step": 4122 }, { "epoch": 374.8727272727273, "grad_norm": 0.27523255348205566, "learning_rate": 8.77e-06, "loss": 0.2955, "step": 4123 }, { "epoch": 374.969696969697, "grad_norm": 0.3101111054420471, "learning_rate": 8.76e-06, "loss": 0.3227, "step": 4124 }, { "epoch": 375.0, "grad_norm": 0.44706135988235474, "learning_rate": 8.75e-06, "loss": 0.3418, "step": 4125 }, { "epoch": 375.0969696969697, "grad_norm": 0.3133430480957031, "learning_rate": 8.740000000000001e-06, "loss": 0.3404, "step": 4126 }, { "epoch": 375.1939393939394, "grad_norm": 0.2988424599170685, "learning_rate": 8.730000000000001e-06, "loss": 0.3702, "step": 4127 }, { "epoch": 375.2909090909091, "grad_norm": 0.28647005558013916, "learning_rate": 8.720000000000001e-06, "loss": 0.3317, "step": 4128 }, { "epoch": 375.3878787878788, "grad_norm": 0.29622405767440796, "learning_rate": 8.71e-06, "loss": 0.3167, "step": 4129 }, { "epoch": 375.4848484848485, "grad_norm": 0.2939758002758026, "learning_rate": 8.7e-06, "loss": 0.3244, "step": 4130 }, { "epoch": 375.4848484848485, "eval_loss": 0.4248404800891876, "eval_runtime": 2.1331, "eval_samples_per_second": 25.784, "eval_steps_per_second": 3.282, "step": 4130 }, { "epoch": 375.58181818181816, "grad_norm": 0.25706255435943604, "learning_rate": 8.690000000000002e-06, "loss": 0.337, "step": 4131 }, { "epoch": 375.6787878787879, "grad_norm": 0.30878663063049316, "learning_rate": 8.68e-06, "loss": 0.2789, "step": 4132 }, { "epoch": 375.77575757575755, "grad_norm": 0.29647648334503174, "learning_rate": 8.67e-06, "loss": 0.348, "step": 4133 }, { "epoch": 375.8727272727273, "grad_norm": 0.31073883175849915, "learning_rate": 8.66e-06, "loss": 0.2972, "step": 4134 }, { "epoch": 375.969696969697, "grad_norm": 0.3301756978034973, "learning_rate": 8.65e-06, "loss": 0.3553, "step": 4135 }, { "epoch": 376.0, "grad_norm": 0.5006003975868225, "learning_rate": 8.64e-06, "loss": 0.347, "step": 4136 }, { "epoch": 376.0969696969697, "grad_norm": 0.2915913462638855, "learning_rate": 8.63e-06, "loss": 0.3644, "step": 4137 }, { "epoch": 376.1939393939394, "grad_norm": 0.26376426219940186, "learning_rate": 8.62e-06, "loss": 0.3347, "step": 4138 }, { "epoch": 376.2909090909091, "grad_norm": 0.31158575415611267, "learning_rate": 8.61e-06, "loss": 0.3291, "step": 4139 }, { "epoch": 376.3878787878788, "grad_norm": 0.2940891981124878, "learning_rate": 8.599999999999999e-06, "loss": 0.328, "step": 4140 }, { "epoch": 376.3878787878788, "eval_loss": 0.424710214138031, "eval_runtime": 2.1258, "eval_samples_per_second": 25.873, "eval_steps_per_second": 3.293, "step": 4140 }, { "epoch": 376.4848484848485, "grad_norm": 0.3219124376773834, "learning_rate": 8.59e-06, "loss": 0.3244, "step": 4141 }, { "epoch": 376.58181818181816, "grad_norm": 0.23037515580654144, "learning_rate": 8.580000000000001e-06, "loss": 0.3396, "step": 4142 }, { "epoch": 376.6787878787879, "grad_norm": 0.2835302948951721, "learning_rate": 8.570000000000001e-06, "loss": 0.3036, "step": 4143 }, { "epoch": 376.77575757575755, "grad_norm": 0.30571696162223816, "learning_rate": 8.56e-06, "loss": 0.3398, "step": 4144 }, { "epoch": 376.8727272727273, "grad_norm": 0.3565056622028351, "learning_rate": 8.550000000000001e-06, "loss": 0.3289, "step": 4145 }, { "epoch": 376.969696969697, "grad_norm": 0.30703380703926086, "learning_rate": 8.540000000000001e-06, "loss": 0.307, "step": 4146 }, { "epoch": 377.0, "grad_norm": 0.5746144652366638, "learning_rate": 8.53e-06, "loss": 0.3417, "step": 4147 }, { "epoch": 377.0969696969697, "grad_norm": 0.31260401010513306, "learning_rate": 8.52e-06, "loss": 0.3397, "step": 4148 }, { "epoch": 377.1939393939394, "grad_norm": 0.26729366183280945, "learning_rate": 8.51e-06, "loss": 0.324, "step": 4149 }, { "epoch": 377.2909090909091, "grad_norm": 0.31326571106910706, "learning_rate": 8.500000000000002e-06, "loss": 0.3467, "step": 4150 }, { "epoch": 377.2909090909091, "eval_loss": 0.42445996403694153, "eval_runtime": 2.1611, "eval_samples_per_second": 25.45, "eval_steps_per_second": 3.239, "step": 4150 }, { "epoch": 377.3878787878788, "grad_norm": 0.31573301553726196, "learning_rate": 8.49e-06, "loss": 0.3165, "step": 4151 }, { "epoch": 377.4848484848485, "grad_norm": 0.2820153534412384, "learning_rate": 8.48e-06, "loss": 0.3455, "step": 4152 }, { "epoch": 377.58181818181816, "grad_norm": 0.3372909128665924, "learning_rate": 8.47e-06, "loss": 0.3157, "step": 4153 }, { "epoch": 377.6787878787879, "grad_norm": 0.3248935043811798, "learning_rate": 8.46e-06, "loss": 0.3208, "step": 4154 }, { "epoch": 377.77575757575755, "grad_norm": 0.3556617796421051, "learning_rate": 8.45e-06, "loss": 0.3412, "step": 4155 }, { "epoch": 377.8727272727273, "grad_norm": 0.29159489274024963, "learning_rate": 8.44e-06, "loss": 0.2932, "step": 4156 }, { "epoch": 377.969696969697, "grad_norm": 0.27935653924942017, "learning_rate": 8.43e-06, "loss": 0.3583, "step": 4157 }, { "epoch": 378.0, "grad_norm": 0.45205578207969666, "learning_rate": 8.42e-06, "loss": 0.3296, "step": 4158 }, { "epoch": 378.0969696969697, "grad_norm": 0.349948525428772, "learning_rate": 8.409999999999999e-06, "loss": 0.3178, "step": 4159 }, { "epoch": 378.1939393939394, "grad_norm": 0.305345356464386, "learning_rate": 8.400000000000001e-06, "loss": 0.3089, "step": 4160 }, { "epoch": 378.1939393939394, "eval_loss": 0.4246861934661865, "eval_runtime": 2.1364, "eval_samples_per_second": 25.744, "eval_steps_per_second": 3.277, "step": 4160 }, { "epoch": 378.2909090909091, "grad_norm": 0.3020115792751312, "learning_rate": 8.390000000000001e-06, "loss": 0.3357, "step": 4161 }, { "epoch": 378.3878787878788, "grad_norm": 0.24768730998039246, "learning_rate": 8.380000000000001e-06, "loss": 0.2948, "step": 4162 }, { "epoch": 378.4848484848485, "grad_norm": 0.2680698037147522, "learning_rate": 8.37e-06, "loss": 0.3254, "step": 4163 }, { "epoch": 378.58181818181816, "grad_norm": 0.2712077796459198, "learning_rate": 8.36e-06, "loss": 0.3501, "step": 4164 }, { "epoch": 378.6787878787879, "grad_norm": 0.30424049496650696, "learning_rate": 8.350000000000001e-06, "loss": 0.2979, "step": 4165 }, { "epoch": 378.77575757575755, "grad_norm": 0.2615557312965393, "learning_rate": 8.34e-06, "loss": 0.3759, "step": 4166 }, { "epoch": 378.8727272727273, "grad_norm": 0.2612929940223694, "learning_rate": 8.33e-06, "loss": 0.3204, "step": 4167 }, { "epoch": 378.969696969697, "grad_norm": 0.34027451276779175, "learning_rate": 8.32e-06, "loss": 0.3751, "step": 4168 }, { "epoch": 379.0, "grad_norm": 0.4507916569709778, "learning_rate": 8.31e-06, "loss": 0.306, "step": 4169 }, { "epoch": 379.0969696969697, "grad_norm": 0.3092374801635742, "learning_rate": 8.3e-06, "loss": 0.333, "step": 4170 }, { "epoch": 379.0969696969697, "eval_loss": 0.4247243404388428, "eval_runtime": 2.1965, "eval_samples_per_second": 25.04, "eval_steps_per_second": 3.187, "step": 4170 }, { "epoch": 379.1939393939394, "grad_norm": 0.264586478471756, "learning_rate": 8.29e-06, "loss": 0.3343, "step": 4171 }, { "epoch": 379.2909090909091, "grad_norm": 0.3064897060394287, "learning_rate": 8.28e-06, "loss": 0.3234, "step": 4172 }, { "epoch": 379.3878787878788, "grad_norm": 0.33171793818473816, "learning_rate": 8.27e-06, "loss": 0.3462, "step": 4173 }, { "epoch": 379.4848484848485, "grad_norm": 0.28818413615226746, "learning_rate": 8.26e-06, "loss": 0.3108, "step": 4174 }, { "epoch": 379.58181818181816, "grad_norm": 0.3123342990875244, "learning_rate": 8.25e-06, "loss": 0.3585, "step": 4175 }, { "epoch": 379.6787878787879, "grad_norm": 0.3093242347240448, "learning_rate": 8.24e-06, "loss": 0.3032, "step": 4176 }, { "epoch": 379.77575757575755, "grad_norm": 0.28044188022613525, "learning_rate": 8.23e-06, "loss": 0.3217, "step": 4177 }, { "epoch": 379.8727272727273, "grad_norm": 0.3285064399242401, "learning_rate": 8.22e-06, "loss": 0.3583, "step": 4178 }, { "epoch": 379.969696969697, "grad_norm": 0.29143503308296204, "learning_rate": 8.210000000000001e-06, "loss": 0.3071, "step": 4179 }, { "epoch": 380.0, "grad_norm": 0.5048023462295532, "learning_rate": 8.200000000000001e-06, "loss": 0.3354, "step": 4180 }, { "epoch": 380.0, "eval_loss": 0.4245156943798065, "eval_runtime": 2.1402, "eval_samples_per_second": 25.698, "eval_steps_per_second": 3.271, "step": 4180 }, { "epoch": 380.0969696969697, "grad_norm": 0.269228458404541, "learning_rate": 8.190000000000001e-06, "loss": 0.3459, "step": 4181 }, { "epoch": 380.1939393939394, "grad_norm": 0.2466476857662201, "learning_rate": 8.18e-06, "loss": 0.3371, "step": 4182 }, { "epoch": 380.2909090909091, "grad_norm": 0.29603132605552673, "learning_rate": 8.17e-06, "loss": 0.3512, "step": 4183 }, { "epoch": 380.3878787878788, "grad_norm": 0.29953810572624207, "learning_rate": 8.160000000000001e-06, "loss": 0.3179, "step": 4184 }, { "epoch": 380.4848484848485, "grad_norm": 0.2953645586967468, "learning_rate": 8.15e-06, "loss": 0.3459, "step": 4185 }, { "epoch": 380.58181818181816, "grad_norm": 0.3172391355037689, "learning_rate": 8.14e-06, "loss": 0.3277, "step": 4186 }, { "epoch": 380.6787878787879, "grad_norm": 0.29355430603027344, "learning_rate": 8.13e-06, "loss": 0.3377, "step": 4187 }, { "epoch": 380.77575757575755, "grad_norm": 0.3100706934928894, "learning_rate": 8.12e-06, "loss": 0.3176, "step": 4188 }, { "epoch": 380.8727272727273, "grad_norm": 0.27939969301223755, "learning_rate": 8.11e-06, "loss": 0.3162, "step": 4189 }, { "epoch": 380.969696969697, "grad_norm": 0.2738233804702759, "learning_rate": 8.1e-06, "loss": 0.3127, "step": 4190 }, { "epoch": 380.969696969697, "eval_loss": 0.4244054853916168, "eval_runtime": 2.1388, "eval_samples_per_second": 25.716, "eval_steps_per_second": 3.273, "step": 4190 }, { "epoch": 381.0, "grad_norm": 0.4269275367259979, "learning_rate": 8.09e-06, "loss": 0.2913, "step": 4191 }, { "epoch": 381.0969696969697, "grad_norm": 0.27549001574516296, "learning_rate": 8.08e-06, "loss": 0.3624, "step": 4192 }, { "epoch": 381.1939393939394, "grad_norm": 0.3019556999206543, "learning_rate": 8.069999999999999e-06, "loss": 0.3107, "step": 4193 }, { "epoch": 381.2909090909091, "grad_norm": 0.2658679485321045, "learning_rate": 8.06e-06, "loss": 0.327, "step": 4194 }, { "epoch": 381.3878787878788, "grad_norm": 0.2855111062526703, "learning_rate": 8.050000000000001e-06, "loss": 0.3253, "step": 4195 }, { "epoch": 381.4848484848485, "grad_norm": 0.3003518879413605, "learning_rate": 8.040000000000001e-06, "loss": 0.3105, "step": 4196 }, { "epoch": 381.58181818181816, "grad_norm": 0.31421568989753723, "learning_rate": 8.03e-06, "loss": 0.3181, "step": 4197 }, { "epoch": 381.6787878787879, "grad_norm": 0.257122278213501, "learning_rate": 8.02e-06, "loss": 0.3346, "step": 4198 }, { "epoch": 381.77575757575755, "grad_norm": 0.2668331265449524, "learning_rate": 8.010000000000001e-06, "loss": 0.3353, "step": 4199 }, { "epoch": 381.8727272727273, "grad_norm": 0.2797575294971466, "learning_rate": 8.000000000000001e-06, "loss": 0.3628, "step": 4200 }, { "epoch": 381.8727272727273, "eval_loss": 0.42448917031288147, "eval_runtime": 2.1343, "eval_samples_per_second": 25.769, "eval_steps_per_second": 3.28, "step": 4200 }, { "epoch": 381.969696969697, "grad_norm": 0.3442399203777313, "learning_rate": 7.99e-06, "loss": 0.3209, "step": 4201 }, { "epoch": 382.0, "grad_norm": 0.5818289518356323, "learning_rate": 7.98e-06, "loss": 0.297, "step": 4202 }, { "epoch": 382.0969696969697, "grad_norm": 0.3574053645133972, "learning_rate": 7.97e-06, "loss": 0.3459, "step": 4203 }, { "epoch": 382.1939393939394, "grad_norm": 0.3037893772125244, "learning_rate": 7.96e-06, "loss": 0.3341, "step": 4204 }, { "epoch": 382.2909090909091, "grad_norm": 0.29630377888679504, "learning_rate": 7.95e-06, "loss": 0.3139, "step": 4205 }, { "epoch": 382.3878787878788, "grad_norm": 0.2979125380516052, "learning_rate": 7.94e-06, "loss": 0.3363, "step": 4206 }, { "epoch": 382.4848484848485, "grad_norm": 0.320667028427124, "learning_rate": 7.93e-06, "loss": 0.3347, "step": 4207 }, { "epoch": 382.58181818181816, "grad_norm": 0.30430322885513306, "learning_rate": 7.92e-06, "loss": 0.3295, "step": 4208 }, { "epoch": 382.6787878787879, "grad_norm": 0.2292478084564209, "learning_rate": 7.91e-06, "loss": 0.3027, "step": 4209 }, { "epoch": 382.77575757575755, "grad_norm": 0.27211201190948486, "learning_rate": 7.9e-06, "loss": 0.3278, "step": 4210 }, { "epoch": 382.77575757575755, "eval_loss": 0.4243857264518738, "eval_runtime": 2.1481, "eval_samples_per_second": 25.604, "eval_steps_per_second": 3.259, "step": 4210 }, { "epoch": 382.8727272727273, "grad_norm": 0.3260558247566223, "learning_rate": 7.89e-06, "loss": 0.3245, "step": 4211 }, { "epoch": 382.969696969697, "grad_norm": 0.29133886098861694, "learning_rate": 7.879999999999999e-06, "loss": 0.3401, "step": 4212 }, { "epoch": 383.0, "grad_norm": 0.4845393896102905, "learning_rate": 7.870000000000001e-06, "loss": 0.3478, "step": 4213 }, { "epoch": 383.0969696969697, "grad_norm": 0.3398871421813965, "learning_rate": 7.860000000000001e-06, "loss": 0.3212, "step": 4214 }, { "epoch": 383.1939393939394, "grad_norm": 0.2844429016113281, "learning_rate": 7.850000000000001e-06, "loss": 0.3588, "step": 4215 }, { "epoch": 383.2909090909091, "grad_norm": 0.3390371799468994, "learning_rate": 7.84e-06, "loss": 0.2931, "step": 4216 }, { "epoch": 383.3878787878788, "grad_norm": 0.29903873801231384, "learning_rate": 7.83e-06, "loss": 0.3449, "step": 4217 }, { "epoch": 383.4848484848485, "grad_norm": 0.244550421833992, "learning_rate": 7.820000000000001e-06, "loss": 0.3379, "step": 4218 }, { "epoch": 383.58181818181816, "grad_norm": 0.28961002826690674, "learning_rate": 7.810000000000001e-06, "loss": 0.3522, "step": 4219 }, { "epoch": 383.6787878787879, "grad_norm": 0.2954062521457672, "learning_rate": 7.8e-06, "loss": 0.269, "step": 4220 }, { "epoch": 383.6787878787879, "eval_loss": 0.4247341454029083, "eval_runtime": 2.1211, "eval_samples_per_second": 25.93, "eval_steps_per_second": 3.3, "step": 4220 }, { "epoch": 383.77575757575755, "grad_norm": 0.2589881718158722, "learning_rate": 7.79e-06, "loss": 0.3359, "step": 4221 }, { "epoch": 383.8727272727273, "grad_norm": 0.3851045072078705, "learning_rate": 7.78e-06, "loss": 0.3301, "step": 4222 }, { "epoch": 383.969696969697, "grad_norm": 0.26817086338996887, "learning_rate": 7.77e-06, "loss": 0.3367, "step": 4223 }, { "epoch": 384.0, "grad_norm": 0.5171042084693909, "learning_rate": 7.76e-06, "loss": 0.3863, "step": 4224 }, { "epoch": 384.0969696969697, "grad_norm": 0.3027506172657013, "learning_rate": 7.75e-06, "loss": 0.3258, "step": 4225 }, { "epoch": 384.1939393939394, "grad_norm": 0.27451062202453613, "learning_rate": 7.74e-06, "loss": 0.3417, "step": 4226 }, { "epoch": 384.2909090909091, "grad_norm": 0.29753100872039795, "learning_rate": 7.73e-06, "loss": 0.331, "step": 4227 }, { "epoch": 384.3878787878788, "grad_norm": 0.31794247031211853, "learning_rate": 7.72e-06, "loss": 0.308, "step": 4228 }, { "epoch": 384.4848484848485, "grad_norm": 0.30627694725990295, "learning_rate": 7.71e-06, "loss": 0.3245, "step": 4229 }, { "epoch": 384.58181818181816, "grad_norm": 0.3163098692893982, "learning_rate": 7.7e-06, "loss": 0.3194, "step": 4230 }, { "epoch": 384.58181818181816, "eval_loss": 0.424343466758728, "eval_runtime": 2.13, "eval_samples_per_second": 25.821, "eval_steps_per_second": 3.286, "step": 4230 }, { "epoch": 384.6787878787879, "grad_norm": 0.2726805806159973, "learning_rate": 7.69e-06, "loss": 0.341, "step": 4231 }, { "epoch": 384.77575757575755, "grad_norm": 0.28615909814834595, "learning_rate": 7.68e-06, "loss": 0.3394, "step": 4232 }, { "epoch": 384.8727272727273, "grad_norm": 0.3741395175457001, "learning_rate": 7.670000000000001e-06, "loss": 0.337, "step": 4233 }, { "epoch": 384.969696969697, "grad_norm": 0.25299376249313354, "learning_rate": 7.660000000000001e-06, "loss": 0.3416, "step": 4234 }, { "epoch": 385.0, "grad_norm": 0.5409793853759766, "learning_rate": 7.65e-06, "loss": 0.2778, "step": 4235 }, { "epoch": 385.0969696969697, "grad_norm": 0.38123613595962524, "learning_rate": 7.64e-06, "loss": 0.3322, "step": 4236 }, { "epoch": 385.1939393939394, "grad_norm": 0.3055660128593445, "learning_rate": 7.630000000000001e-06, "loss": 0.357, "step": 4237 }, { "epoch": 385.2909090909091, "grad_norm": 0.285840779542923, "learning_rate": 7.620000000000001e-06, "loss": 0.3618, "step": 4238 }, { "epoch": 385.3878787878788, "grad_norm": 0.289868026971817, "learning_rate": 7.610000000000001e-06, "loss": 0.3113, "step": 4239 }, { "epoch": 385.4848484848485, "grad_norm": 0.26244574785232544, "learning_rate": 7.6e-06, "loss": 0.3111, "step": 4240 }, { "epoch": 385.4848484848485, "eval_loss": 0.4244261085987091, "eval_runtime": 2.1304, "eval_samples_per_second": 25.817, "eval_steps_per_second": 3.286, "step": 4240 }, { "epoch": 385.58181818181816, "grad_norm": 0.3094077408313751, "learning_rate": 7.59e-06, "loss": 0.3327, "step": 4241 }, { "epoch": 385.6787878787879, "grad_norm": 0.27038809657096863, "learning_rate": 7.580000000000001e-06, "loss": 0.3015, "step": 4242 }, { "epoch": 385.77575757575755, "grad_norm": 0.2761613428592682, "learning_rate": 7.57e-06, "loss": 0.2914, "step": 4243 }, { "epoch": 385.8727272727273, "grad_norm": 0.3535533845424652, "learning_rate": 7.5600000000000005e-06, "loss": 0.3411, "step": 4244 }, { "epoch": 385.969696969697, "grad_norm": 0.2971293032169342, "learning_rate": 7.55e-06, "loss": 0.3395, "step": 4245 }, { "epoch": 386.0, "grad_norm": 0.5061172246932983, "learning_rate": 7.54e-06, "loss": 0.3751, "step": 4246 }, { "epoch": 386.0969696969697, "grad_norm": 0.3000218868255615, "learning_rate": 7.530000000000001e-06, "loss": 0.3364, "step": 4247 }, { "epoch": 386.1939393939394, "grad_norm": 0.2683129906654358, "learning_rate": 7.520000000000001e-06, "loss": 0.3172, "step": 4248 }, { "epoch": 386.2909090909091, "grad_norm": 0.26465749740600586, "learning_rate": 7.51e-06, "loss": 0.2949, "step": 4249 }, { "epoch": 386.3878787878788, "grad_norm": 0.2655685544013977, "learning_rate": 7.5e-06, "loss": 0.3697, "step": 4250 }, { "epoch": 386.3878787878788, "eval_loss": 0.424681156873703, "eval_runtime": 2.1213, "eval_samples_per_second": 25.927, "eval_steps_per_second": 3.3, "step": 4250 }, { "epoch": 386.4848484848485, "grad_norm": 0.3396849036216736, "learning_rate": 7.4899999999999994e-06, "loss": 0.3515, "step": 4251 }, { "epoch": 386.58181818181816, "grad_norm": 0.31697672605514526, "learning_rate": 7.480000000000001e-06, "loss": 0.3149, "step": 4252 }, { "epoch": 386.6787878787879, "grad_norm": 0.2642710208892822, "learning_rate": 7.4700000000000005e-06, "loss": 0.2963, "step": 4253 }, { "epoch": 386.77575757575755, "grad_norm": 0.3324968218803406, "learning_rate": 7.4600000000000006e-06, "loss": 0.3494, "step": 4254 }, { "epoch": 386.8727272727273, "grad_norm": 0.3071610629558563, "learning_rate": 7.45e-06, "loss": 0.3504, "step": 4255 }, { "epoch": 386.969696969697, "grad_norm": 0.25106191635131836, "learning_rate": 7.44e-06, "loss": 0.3138, "step": 4256 }, { "epoch": 387.0, "grad_norm": 0.4752749800682068, "learning_rate": 7.430000000000001e-06, "loss": 0.3195, "step": 4257 }, { "epoch": 387.0969696969697, "grad_norm": 0.2990110218524933, "learning_rate": 7.420000000000001e-06, "loss": 0.2864, "step": 4258 }, { "epoch": 387.1939393939394, "grad_norm": 0.2590501010417938, "learning_rate": 7.41e-06, "loss": 0.3396, "step": 4259 }, { "epoch": 387.2909090909091, "grad_norm": 0.292054146528244, "learning_rate": 7.4e-06, "loss": 0.3605, "step": 4260 }, { "epoch": 387.2909090909091, "eval_loss": 0.4244232773780823, "eval_runtime": 2.13, "eval_samples_per_second": 25.822, "eval_steps_per_second": 3.286, "step": 4260 }, { "epoch": 387.3878787878788, "grad_norm": 0.31396231055259705, "learning_rate": 7.3899999999999995e-06, "loss": 0.3332, "step": 4261 }, { "epoch": 387.4848484848485, "grad_norm": 0.46617692708969116, "learning_rate": 7.3800000000000005e-06, "loss": 0.3425, "step": 4262 }, { "epoch": 387.58181818181816, "grad_norm": 0.3220658004283905, "learning_rate": 7.370000000000001e-06, "loss": 0.3119, "step": 4263 }, { "epoch": 387.6787878787879, "grad_norm": 0.2932121157646179, "learning_rate": 7.36e-06, "loss": 0.3344, "step": 4264 }, { "epoch": 387.77575757575755, "grad_norm": 0.3561127483844757, "learning_rate": 7.35e-06, "loss": 0.3304, "step": 4265 }, { "epoch": 387.8727272727273, "grad_norm": 0.316982626914978, "learning_rate": 7.340000000000001e-06, "loss": 0.3156, "step": 4266 }, { "epoch": 387.969696969697, "grad_norm": 0.34779998660087585, "learning_rate": 7.330000000000001e-06, "loss": 0.3207, "step": 4267 }, { "epoch": 388.0, "grad_norm": 0.618858277797699, "learning_rate": 7.32e-06, "loss": 0.3917, "step": 4268 }, { "epoch": 388.0969696969697, "grad_norm": 0.27004459500312805, "learning_rate": 7.31e-06, "loss": 0.3537, "step": 4269 }, { "epoch": 388.1939393939394, "grad_norm": 0.3083222508430481, "learning_rate": 7.2999999999999996e-06, "loss": 0.3302, "step": 4270 }, { "epoch": 388.1939393939394, "eval_loss": 0.4242541193962097, "eval_runtime": 2.1265, "eval_samples_per_second": 25.864, "eval_steps_per_second": 3.292, "step": 4270 }, { "epoch": 388.2909090909091, "grad_norm": 0.2764505445957184, "learning_rate": 7.290000000000001e-06, "loss": 0.3211, "step": 4271 }, { "epoch": 388.3878787878788, "grad_norm": 0.2886534333229065, "learning_rate": 7.280000000000001e-06, "loss": 0.3287, "step": 4272 }, { "epoch": 388.4848484848485, "grad_norm": 0.32928892970085144, "learning_rate": 7.270000000000001e-06, "loss": 0.3167, "step": 4273 }, { "epoch": 388.58181818181816, "grad_norm": 0.2919245660305023, "learning_rate": 7.26e-06, "loss": 0.337, "step": 4274 }, { "epoch": 388.6787878787879, "grad_norm": 0.26534411311149597, "learning_rate": 7.25e-06, "loss": 0.303, "step": 4275 }, { "epoch": 388.77575757575755, "grad_norm": 0.31570714712142944, "learning_rate": 7.240000000000001e-06, "loss": 0.3339, "step": 4276 }, { "epoch": 388.8727272727273, "grad_norm": 0.2628118395805359, "learning_rate": 7.230000000000001e-06, "loss": 0.3206, "step": 4277 }, { "epoch": 388.969696969697, "grad_norm": 0.29525238275527954, "learning_rate": 7.22e-06, "loss": 0.3514, "step": 4278 }, { "epoch": 389.0, "grad_norm": 0.4390280842781067, "learning_rate": 7.2100000000000004e-06, "loss": 0.3143, "step": 4279 }, { "epoch": 389.0969696969697, "grad_norm": 0.27595826983451843, "learning_rate": 7.2e-06, "loss": 0.3055, "step": 4280 }, { "epoch": 389.0969696969697, "eval_loss": 0.4243329167366028, "eval_runtime": 2.1301, "eval_samples_per_second": 25.82, "eval_steps_per_second": 3.286, "step": 4280 }, { "epoch": 389.1939393939394, "grad_norm": 0.2787480056285858, "learning_rate": 7.190000000000001e-06, "loss": 0.3158, "step": 4281 }, { "epoch": 389.2909090909091, "grad_norm": 0.3227105736732483, "learning_rate": 7.180000000000001e-06, "loss": 0.3747, "step": 4282 }, { "epoch": 389.3878787878788, "grad_norm": 0.29996225237846375, "learning_rate": 7.17e-06, "loss": 0.3398, "step": 4283 }, { "epoch": 389.4848484848485, "grad_norm": 0.269465833902359, "learning_rate": 7.16e-06, "loss": 0.2991, "step": 4284 }, { "epoch": 389.58181818181816, "grad_norm": 0.3202161490917206, "learning_rate": 7.15e-06, "loss": 0.335, "step": 4285 }, { "epoch": 389.6787878787879, "grad_norm": 0.24681954085826874, "learning_rate": 7.140000000000001e-06, "loss": 0.3572, "step": 4286 }, { "epoch": 389.77575757575755, "grad_norm": 0.2774510681629181, "learning_rate": 7.13e-06, "loss": 0.3156, "step": 4287 }, { "epoch": 389.8727272727273, "grad_norm": 0.3508288562297821, "learning_rate": 7.1200000000000004e-06, "loss": 0.3452, "step": 4288 }, { "epoch": 389.969696969697, "grad_norm": 0.27009424567222595, "learning_rate": 7.11e-06, "loss": 0.2825, "step": 4289 }, { "epoch": 390.0, "grad_norm": 0.6318101286888123, "learning_rate": 7.1e-06, "loss": 0.4019, "step": 4290 }, { "epoch": 390.0, "eval_loss": 0.42439505457878113, "eval_runtime": 2.1324, "eval_samples_per_second": 25.793, "eval_steps_per_second": 3.283, "step": 4290 }, { "epoch": 390.0969696969697, "grad_norm": 0.26486244797706604, "learning_rate": 7.090000000000001e-06, "loss": 0.3427, "step": 4291 }, { "epoch": 390.1939393939394, "grad_norm": 0.3479774296283722, "learning_rate": 7.080000000000001e-06, "loss": 0.3538, "step": 4292 }, { "epoch": 390.2909090909091, "grad_norm": 0.2837066352367401, "learning_rate": 7.07e-06, "loss": 0.3171, "step": 4293 }, { "epoch": 390.3878787878788, "grad_norm": 0.2607334852218628, "learning_rate": 7.06e-06, "loss": 0.2954, "step": 4294 }, { "epoch": 390.4848484848485, "grad_norm": 0.3070979118347168, "learning_rate": 7.049999999999999e-06, "loss": 0.3584, "step": 4295 }, { "epoch": 390.58181818181816, "grad_norm": 0.3067142963409424, "learning_rate": 7.04e-06, "loss": 0.3348, "step": 4296 }, { "epoch": 390.6787878787879, "grad_norm": 0.2787237763404846, "learning_rate": 7.0300000000000005e-06, "loss": 0.3376, "step": 4297 }, { "epoch": 390.77575757575755, "grad_norm": 0.2802588641643524, "learning_rate": 7.0200000000000006e-06, "loss": 0.3186, "step": 4298 }, { "epoch": 390.8727272727273, "grad_norm": 0.2827461361885071, "learning_rate": 7.01e-06, "loss": 0.3325, "step": 4299 }, { "epoch": 390.969696969697, "grad_norm": 0.3040320575237274, "learning_rate": 7.000000000000001e-06, "loss": 0.3013, "step": 4300 }, { "epoch": 390.969696969697, "eval_loss": 0.42447808384895325, "eval_runtime": 2.1287, "eval_samples_per_second": 25.837, "eval_steps_per_second": 3.288, "step": 4300 }, { "epoch": 391.0, "grad_norm": 0.3805631697177887, "learning_rate": 6.990000000000001e-06, "loss": 0.3153, "step": 4301 }, { "epoch": 391.0969696969697, "grad_norm": 0.30007538199424744, "learning_rate": 6.98e-06, "loss": 0.3193, "step": 4302 }, { "epoch": 391.1939393939394, "grad_norm": 0.25372499227523804, "learning_rate": 6.97e-06, "loss": 0.3496, "step": 4303 }, { "epoch": 391.2909090909091, "grad_norm": 0.5099557042121887, "learning_rate": 6.9599999999999994e-06, "loss": 0.2949, "step": 4304 }, { "epoch": 391.3878787878788, "grad_norm": 0.2995280623435974, "learning_rate": 6.950000000000001e-06, "loss": 0.3355, "step": 4305 }, { "epoch": 391.4848484848485, "grad_norm": 0.26424339413642883, "learning_rate": 6.9400000000000005e-06, "loss": 0.358, "step": 4306 }, { "epoch": 391.58181818181816, "grad_norm": 0.33444005250930786, "learning_rate": 6.9300000000000006e-06, "loss": 0.3335, "step": 4307 }, { "epoch": 391.6787878787879, "grad_norm": 0.29056233167648315, "learning_rate": 6.92e-06, "loss": 0.3384, "step": 4308 }, { "epoch": 391.77575757575755, "grad_norm": 0.3315895199775696, "learning_rate": 6.91e-06, "loss": 0.3234, "step": 4309 }, { "epoch": 391.8727272727273, "grad_norm": 0.2590215504169464, "learning_rate": 6.900000000000001e-06, "loss": 0.3019, "step": 4310 }, { "epoch": 391.8727272727273, "eval_loss": 0.42437347769737244, "eval_runtime": 2.1192, "eval_samples_per_second": 25.953, "eval_steps_per_second": 3.303, "step": 4310 }, { "epoch": 391.969696969697, "grad_norm": 0.28518152236938477, "learning_rate": 6.890000000000001e-06, "loss": 0.3496, "step": 4311 }, { "epoch": 392.0, "grad_norm": 0.5114558339118958, "learning_rate": 6.88e-06, "loss": 0.2863, "step": 4312 }, { "epoch": 392.0969696969697, "grad_norm": 0.2683810293674469, "learning_rate": 6.87e-06, "loss": 0.3313, "step": 4313 }, { "epoch": 392.1939393939394, "grad_norm": 0.29235902428627014, "learning_rate": 6.8599999999999995e-06, "loss": 0.3362, "step": 4314 }, { "epoch": 392.2909090909091, "grad_norm": 0.2714664041996002, "learning_rate": 6.8500000000000005e-06, "loss": 0.3688, "step": 4315 }, { "epoch": 392.3878787878788, "grad_norm": 0.2590443193912506, "learning_rate": 6.840000000000001e-06, "loss": 0.3377, "step": 4316 }, { "epoch": 392.4848484848485, "grad_norm": 0.32732295989990234, "learning_rate": 6.830000000000001e-06, "loss": 0.3176, "step": 4317 }, { "epoch": 392.58181818181816, "grad_norm": 0.2644732594490051, "learning_rate": 6.82e-06, "loss": 0.3194, "step": 4318 }, { "epoch": 392.6787878787879, "grad_norm": 0.29884040355682373, "learning_rate": 6.81e-06, "loss": 0.3386, "step": 4319 }, { "epoch": 392.77575757575755, "grad_norm": 0.3094496428966522, "learning_rate": 6.800000000000001e-06, "loss": 0.327, "step": 4320 }, { "epoch": 392.77575757575755, "eval_loss": 0.42413949966430664, "eval_runtime": 2.1306, "eval_samples_per_second": 25.814, "eval_steps_per_second": 3.285, "step": 4320 }, { "epoch": 392.8727272727273, "grad_norm": 0.2963372766971588, "learning_rate": 6.79e-06, "loss": 0.2948, "step": 4321 }, { "epoch": 392.969696969697, "grad_norm": 0.25973087549209595, "learning_rate": 6.78e-06, "loss": 0.3123, "step": 4322 }, { "epoch": 393.0, "grad_norm": 0.8426999449729919, "learning_rate": 6.7699999999999996e-06, "loss": 0.3366, "step": 4323 }, { "epoch": 393.0969696969697, "grad_norm": 0.26413246989250183, "learning_rate": 6.76e-06, "loss": 0.362, "step": 4324 }, { "epoch": 393.1939393939394, "grad_norm": 0.31809934973716736, "learning_rate": 6.750000000000001e-06, "loss": 0.3461, "step": 4325 }, { "epoch": 393.2909090909091, "grad_norm": 0.28155213594436646, "learning_rate": 6.740000000000001e-06, "loss": 0.3144, "step": 4326 }, { "epoch": 393.3878787878788, "grad_norm": 0.3532550036907196, "learning_rate": 6.73e-06, "loss": 0.3669, "step": 4327 }, { "epoch": 393.4848484848485, "grad_norm": 0.30630025267601013, "learning_rate": 6.72e-06, "loss": 0.355, "step": 4328 }, { "epoch": 393.58181818181816, "grad_norm": 0.2951465845108032, "learning_rate": 6.710000000000001e-06, "loss": 0.3199, "step": 4329 }, { "epoch": 393.6787878787879, "grad_norm": 0.2646746039390564, "learning_rate": 6.700000000000001e-06, "loss": 0.3071, "step": 4330 }, { "epoch": 393.6787878787879, "eval_loss": 0.4244401454925537, "eval_runtime": 2.1167, "eval_samples_per_second": 25.984, "eval_steps_per_second": 3.307, "step": 4330 }, { "epoch": 393.77575757575755, "grad_norm": 0.27225181460380554, "learning_rate": 6.69e-06, "loss": 0.2883, "step": 4331 }, { "epoch": 393.8727272727273, "grad_norm": 0.2778588831424713, "learning_rate": 6.68e-06, "loss": 0.3288, "step": 4332 }, { "epoch": 393.969696969697, "grad_norm": 0.3167281746864319, "learning_rate": 6.67e-06, "loss": 0.2983, "step": 4333 }, { "epoch": 394.0, "grad_norm": 0.5620117783546448, "learning_rate": 6.660000000000001e-06, "loss": 0.3396, "step": 4334 }, { "epoch": 394.0969696969697, "grad_norm": 0.33203279972076416, "learning_rate": 6.650000000000001e-06, "loss": 0.3326, "step": 4335 }, { "epoch": 394.1939393939394, "grad_norm": 0.27428722381591797, "learning_rate": 6.640000000000001e-06, "loss": 0.3455, "step": 4336 }, { "epoch": 394.2909090909091, "grad_norm": 0.26442083716392517, "learning_rate": 6.63e-06, "loss": 0.3129, "step": 4337 }, { "epoch": 394.3878787878788, "grad_norm": 0.29959186911582947, "learning_rate": 6.62e-06, "loss": 0.2967, "step": 4338 }, { "epoch": 394.4848484848485, "grad_norm": 0.26450297236442566, "learning_rate": 6.610000000000001e-06, "loss": 0.3061, "step": 4339 }, { "epoch": 394.58181818181816, "grad_norm": 0.3069596290588379, "learning_rate": 6.6e-06, "loss": 0.3369, "step": 4340 }, { "epoch": 394.58181818181816, "eval_loss": 0.42409956455230713, "eval_runtime": 2.1323, "eval_samples_per_second": 25.794, "eval_steps_per_second": 3.283, "step": 4340 }, { "epoch": 394.6787878787879, "grad_norm": 0.2847595810890198, "learning_rate": 6.5900000000000004e-06, "loss": 0.3158, "step": 4341 }, { "epoch": 394.77575757575755, "grad_norm": 0.33862245082855225, "learning_rate": 6.58e-06, "loss": 0.3478, "step": 4342 }, { "epoch": 394.8727272727273, "grad_norm": 0.3017573058605194, "learning_rate": 6.57e-06, "loss": 0.3384, "step": 4343 }, { "epoch": 394.969696969697, "grad_norm": 0.2776963710784912, "learning_rate": 6.560000000000001e-06, "loss": 0.3544, "step": 4344 }, { "epoch": 395.0, "grad_norm": 0.5282799005508423, "learning_rate": 6.550000000000001e-06, "loss": 0.323, "step": 4345 }, { "epoch": 395.0969696969697, "grad_norm": 0.31962233781814575, "learning_rate": 6.54e-06, "loss": 0.3292, "step": 4346 }, { "epoch": 395.1939393939394, "grad_norm": 0.28816673159599304, "learning_rate": 6.53e-06, "loss": 0.357, "step": 4347 }, { "epoch": 395.2909090909091, "grad_norm": 0.264009952545166, "learning_rate": 6.519999999999999e-06, "loss": 0.2984, "step": 4348 }, { "epoch": 395.3878787878788, "grad_norm": 0.2885288596153259, "learning_rate": 6.510000000000001e-06, "loss": 0.3199, "step": 4349 }, { "epoch": 395.4848484848485, "grad_norm": 0.32528144121170044, "learning_rate": 6.5000000000000004e-06, "loss": 0.3225, "step": 4350 }, { "epoch": 395.4848484848485, "eval_loss": 0.4243115186691284, "eval_runtime": 2.1296, "eval_samples_per_second": 25.826, "eval_steps_per_second": 3.287, "step": 4350 }, { "epoch": 395.58181818181816, "grad_norm": 0.270365446805954, "learning_rate": 6.4900000000000005e-06, "loss": 0.3231, "step": 4351 }, { "epoch": 395.6787878787879, "grad_norm": 0.2635614275932312, "learning_rate": 6.48e-06, "loss": 0.3142, "step": 4352 }, { "epoch": 395.77575757575755, "grad_norm": 0.2843759059906006, "learning_rate": 6.47e-06, "loss": 0.3191, "step": 4353 }, { "epoch": 395.8727272727273, "grad_norm": 0.441561222076416, "learning_rate": 6.460000000000001e-06, "loss": 0.3365, "step": 4354 }, { "epoch": 395.969696969697, "grad_norm": 0.30812638998031616, "learning_rate": 6.45e-06, "loss": 0.3594, "step": 4355 }, { "epoch": 396.0, "grad_norm": 0.418353796005249, "learning_rate": 6.44e-06, "loss": 0.3349, "step": 4356 }, { "epoch": 396.0969696969697, "grad_norm": 0.27173471450805664, "learning_rate": 6.43e-06, "loss": 0.309, "step": 4357 }, { "epoch": 396.1939393939394, "grad_norm": 0.31804460287094116, "learning_rate": 6.4199999999999995e-06, "loss": 0.3391, "step": 4358 }, { "epoch": 396.2909090909091, "grad_norm": 0.29313576221466064, "learning_rate": 6.4100000000000005e-06, "loss": 0.3235, "step": 4359 }, { "epoch": 396.3878787878788, "grad_norm": 0.2768804430961609, "learning_rate": 6.4000000000000006e-06, "loss": 0.3105, "step": 4360 }, { "epoch": 396.3878787878788, "eval_loss": 0.42421257495880127, "eval_runtime": 2.1208, "eval_samples_per_second": 25.934, "eval_steps_per_second": 3.301, "step": 4360 }, { "epoch": 396.4848484848485, "grad_norm": 0.33066731691360474, "learning_rate": 6.39e-06, "loss": 0.3546, "step": 4361 }, { "epoch": 396.58181818181816, "grad_norm": 0.28355875611305237, "learning_rate": 6.38e-06, "loss": 0.3148, "step": 4362 }, { "epoch": 396.6787878787879, "grad_norm": 0.4002547860145569, "learning_rate": 6.370000000000001e-06, "loss": 0.3381, "step": 4363 }, { "epoch": 396.77575757575755, "grad_norm": 0.25764694809913635, "learning_rate": 6.360000000000001e-06, "loss": 0.3304, "step": 4364 }, { "epoch": 396.8727272727273, "grad_norm": 0.37603357434272766, "learning_rate": 6.35e-06, "loss": 0.3101, "step": 4365 }, { "epoch": 396.969696969697, "grad_norm": 0.31180375814437866, "learning_rate": 6.34e-06, "loss": 0.3443, "step": 4366 }, { "epoch": 397.0, "grad_norm": 0.5192236304283142, "learning_rate": 6.3299999999999995e-06, "loss": 0.3664, "step": 4367 }, { "epoch": 397.0969696969697, "grad_norm": 0.2815007269382477, "learning_rate": 6.320000000000001e-06, "loss": 0.3054, "step": 4368 }, { "epoch": 397.1939393939394, "grad_norm": 0.27915215492248535, "learning_rate": 6.3100000000000006e-06, "loss": 0.3342, "step": 4369 }, { "epoch": 397.2909090909091, "grad_norm": 0.3028716444969177, "learning_rate": 6.300000000000001e-06, "loss": 0.312, "step": 4370 }, { "epoch": 397.2909090909091, "eval_loss": 0.42403414845466614, "eval_runtime": 2.1218, "eval_samples_per_second": 25.921, "eval_steps_per_second": 3.299, "step": 4370 }, { "epoch": 397.3878787878788, "grad_norm": 0.26431724429130554, "learning_rate": 6.29e-06, "loss": 0.3532, "step": 4371 }, { "epoch": 397.4848484848485, "grad_norm": 0.26671725511550903, "learning_rate": 6.28e-06, "loss": 0.3397, "step": 4372 }, { "epoch": 397.58181818181816, "grad_norm": 0.26035550236701965, "learning_rate": 6.270000000000001e-06, "loss": 0.2852, "step": 4373 }, { "epoch": 397.6787878787879, "grad_norm": 0.29644638299942017, "learning_rate": 6.26e-06, "loss": 0.307, "step": 4374 }, { "epoch": 397.77575757575755, "grad_norm": 0.29415082931518555, "learning_rate": 6.25e-06, "loss": 0.3362, "step": 4375 }, { "epoch": 397.8727272727273, "grad_norm": 0.29705363512039185, "learning_rate": 6.24e-06, "loss": 0.3451, "step": 4376 }, { "epoch": 397.969696969697, "grad_norm": 0.3315047323703766, "learning_rate": 6.2300000000000005e-06, "loss": 0.3535, "step": 4377 }, { "epoch": 398.0, "grad_norm": 0.6073850989341736, "learning_rate": 6.22e-06, "loss": 0.3566, "step": 4378 }, { "epoch": 398.0969696969697, "grad_norm": 0.2666085660457611, "learning_rate": 6.210000000000001e-06, "loss": 0.3293, "step": 4379 }, { "epoch": 398.1939393939394, "grad_norm": 0.2885405421257019, "learning_rate": 6.2e-06, "loss": 0.346, "step": 4380 }, { "epoch": 398.1939393939394, "eval_loss": 0.42399489879608154, "eval_runtime": 2.1503, "eval_samples_per_second": 25.578, "eval_steps_per_second": 3.255, "step": 4380 }, { "epoch": 398.2909090909091, "grad_norm": 0.3000956177711487, "learning_rate": 6.19e-06, "loss": 0.3534, "step": 4381 }, { "epoch": 398.3878787878788, "grad_norm": 0.2843852937221527, "learning_rate": 6.18e-06, "loss": 0.2892, "step": 4382 }, { "epoch": 398.4848484848485, "grad_norm": 0.278983473777771, "learning_rate": 6.17e-06, "loss": 0.3321, "step": 4383 }, { "epoch": 398.58181818181816, "grad_norm": 0.24275529384613037, "learning_rate": 6.16e-06, "loss": 0.3351, "step": 4384 }, { "epoch": 398.6787878787879, "grad_norm": 0.2708281874656677, "learning_rate": 6.15e-06, "loss": 0.3005, "step": 4385 }, { "epoch": 398.77575757575755, "grad_norm": 0.32019880414009094, "learning_rate": 6.1400000000000005e-06, "loss": 0.3409, "step": 4386 }, { "epoch": 398.8727272727273, "grad_norm": 0.32802683115005493, "learning_rate": 6.130000000000001e-06, "loss": 0.3375, "step": 4387 }, { "epoch": 398.969696969697, "grad_norm": 0.3256203234195709, "learning_rate": 6.12e-06, "loss": 0.321, "step": 4388 }, { "epoch": 399.0, "grad_norm": 0.49412262439727783, "learning_rate": 6.110000000000001e-06, "loss": 0.3175, "step": 4389 }, { "epoch": 399.0969696969697, "grad_norm": 0.2912564277648926, "learning_rate": 6.1e-06, "loss": 0.328, "step": 4390 }, { "epoch": 399.0969696969697, "eval_loss": 0.4240299165248871, "eval_runtime": 2.1219, "eval_samples_per_second": 25.92, "eval_steps_per_second": 3.299, "step": 4390 }, { "epoch": 399.1939393939394, "grad_norm": 0.30961018800735474, "learning_rate": 6.090000000000001e-06, "loss": 0.2733, "step": 4391 }, { "epoch": 399.2909090909091, "grad_norm": 0.284614622592926, "learning_rate": 6.08e-06, "loss": 0.3479, "step": 4392 }, { "epoch": 399.3878787878788, "grad_norm": 0.2767944633960724, "learning_rate": 6.07e-06, "loss": 0.3286, "step": 4393 }, { "epoch": 399.4848484848485, "grad_norm": 0.30359727144241333, "learning_rate": 6.0600000000000004e-06, "loss": 0.3227, "step": 4394 }, { "epoch": 399.58181818181816, "grad_norm": 0.3214990794658661, "learning_rate": 6.0500000000000005e-06, "loss": 0.3438, "step": 4395 }, { "epoch": 399.6787878787879, "grad_norm": 0.28994226455688477, "learning_rate": 6.040000000000001e-06, "loss": 0.3691, "step": 4396 }, { "epoch": 399.77575757575755, "grad_norm": 0.2830333113670349, "learning_rate": 6.03e-06, "loss": 0.3225, "step": 4397 }, { "epoch": 399.8727272727273, "grad_norm": 0.2534017562866211, "learning_rate": 6.02e-06, "loss": 0.3475, "step": 4398 }, { "epoch": 399.969696969697, "grad_norm": 0.2874850630760193, "learning_rate": 6.01e-06, "loss": 0.3051, "step": 4399 }, { "epoch": 400.0, "grad_norm": 0.5695606470108032, "learning_rate": 6e-06, "loss": 0.3072, "step": 4400 }, { "epoch": 400.0, "eval_loss": 0.4242197275161743, "eval_runtime": 2.1336, "eval_samples_per_second": 25.778, "eval_steps_per_second": 3.281, "step": 4400 }, { "epoch": 400.0969696969697, "grad_norm": 0.2943519949913025, "learning_rate": 5.99e-06, "loss": 0.3076, "step": 4401 }, { "epoch": 400.1939393939394, "grad_norm": 0.26126110553741455, "learning_rate": 5.98e-06, "loss": 0.355, "step": 4402 }, { "epoch": 400.2909090909091, "grad_norm": 0.2612912654876709, "learning_rate": 5.9700000000000004e-06, "loss": 0.326, "step": 4403 }, { "epoch": 400.3878787878788, "grad_norm": 0.28973740339279175, "learning_rate": 5.9600000000000005e-06, "loss": 0.34, "step": 4404 }, { "epoch": 400.4848484848485, "grad_norm": 0.28152215480804443, "learning_rate": 5.95e-06, "loss": 0.3691, "step": 4405 }, { "epoch": 400.58181818181816, "grad_norm": 0.2883203327655792, "learning_rate": 5.940000000000001e-06, "loss": 0.3574, "step": 4406 }, { "epoch": 400.6787878787879, "grad_norm": 0.3067658841609955, "learning_rate": 5.93e-06, "loss": 0.2759, "step": 4407 }, { "epoch": 400.77575757575755, "grad_norm": 0.265886127948761, "learning_rate": 5.920000000000001e-06, "loss": 0.3187, "step": 4408 }, { "epoch": 400.8727272727273, "grad_norm": 0.34973838925361633, "learning_rate": 5.91e-06, "loss": 0.3129, "step": 4409 }, { "epoch": 400.969696969697, "grad_norm": 0.2948808968067169, "learning_rate": 5.9e-06, "loss": 0.3399, "step": 4410 }, { "epoch": 400.969696969697, "eval_loss": 0.42390140891075134, "eval_runtime": 2.1249, "eval_samples_per_second": 25.884, "eval_steps_per_second": 3.294, "step": 4410 }, { "epoch": 401.0, "grad_norm": 0.5547888875007629, "learning_rate": 5.89e-06, "loss": 0.2722, "step": 4411 }, { "epoch": 401.0969696969697, "grad_norm": 0.28168249130249023, "learning_rate": 5.8800000000000005e-06, "loss": 0.3197, "step": 4412 }, { "epoch": 401.1939393939394, "grad_norm": 0.32535097002983093, "learning_rate": 5.8700000000000005e-06, "loss": 0.3446, "step": 4413 }, { "epoch": 401.2909090909091, "grad_norm": 0.3320438265800476, "learning_rate": 5.86e-06, "loss": 0.3055, "step": 4414 }, { "epoch": 401.3878787878788, "grad_norm": 0.33057060837745667, "learning_rate": 5.850000000000001e-06, "loss": 0.3342, "step": 4415 }, { "epoch": 401.4848484848485, "grad_norm": 0.3710538148880005, "learning_rate": 5.84e-06, "loss": 0.3305, "step": 4416 }, { "epoch": 401.58181818181816, "grad_norm": 0.3122580647468567, "learning_rate": 5.83e-06, "loss": 0.3178, "step": 4417 }, { "epoch": 401.6787878787879, "grad_norm": 0.24265459179878235, "learning_rate": 5.82e-06, "loss": 0.3026, "step": 4418 }, { "epoch": 401.77575757575755, "grad_norm": 0.2679807245731354, "learning_rate": 5.81e-06, "loss": 0.33, "step": 4419 }, { "epoch": 401.8727272727273, "grad_norm": 0.28795769810676575, "learning_rate": 5.8e-06, "loss": 0.342, "step": 4420 }, { "epoch": 401.8727272727273, "eval_loss": 0.4240061044692993, "eval_runtime": 2.118, "eval_samples_per_second": 25.968, "eval_steps_per_second": 3.305, "step": 4420 }, { "epoch": 401.969696969697, "grad_norm": 0.3102927505970001, "learning_rate": 5.7900000000000005e-06, "loss": 0.3402, "step": 4421 }, { "epoch": 402.0, "grad_norm": 0.4905965328216553, "learning_rate": 5.78e-06, "loss": 0.3695, "step": 4422 }, { "epoch": 402.0969696969697, "grad_norm": 0.2856130003929138, "learning_rate": 5.770000000000001e-06, "loss": 0.3564, "step": 4423 }, { "epoch": 402.1939393939394, "grad_norm": 0.3797096014022827, "learning_rate": 5.76e-06, "loss": 0.2945, "step": 4424 }, { "epoch": 402.2909090909091, "grad_norm": 0.30673947930336, "learning_rate": 5.750000000000001e-06, "loss": 0.331, "step": 4425 }, { "epoch": 402.3878787878788, "grad_norm": 0.32168588042259216, "learning_rate": 5.74e-06, "loss": 0.3345, "step": 4426 }, { "epoch": 402.4848484848485, "grad_norm": 0.3218797743320465, "learning_rate": 5.73e-06, "loss": 0.3298, "step": 4427 }, { "epoch": 402.58181818181816, "grad_norm": 0.2737649977207184, "learning_rate": 5.72e-06, "loss": 0.3461, "step": 4428 }, { "epoch": 402.6787878787879, "grad_norm": 0.28247711062431335, "learning_rate": 5.71e-06, "loss": 0.2937, "step": 4429 }, { "epoch": 402.77575757575755, "grad_norm": 0.33665931224823, "learning_rate": 5.7000000000000005e-06, "loss": 0.3385, "step": 4430 }, { "epoch": 402.77575757575755, "eval_loss": 0.4240177869796753, "eval_runtime": 2.1337, "eval_samples_per_second": 25.777, "eval_steps_per_second": 3.281, "step": 4430 }, { "epoch": 402.8727272727273, "grad_norm": 0.2630540430545807, "learning_rate": 5.690000000000001e-06, "loss": 0.296, "step": 4431 }, { "epoch": 402.969696969697, "grad_norm": 0.3100334703922272, "learning_rate": 5.680000000000001e-06, "loss": 0.3603, "step": 4432 }, { "epoch": 403.0, "grad_norm": 0.5338719487190247, "learning_rate": 5.67e-06, "loss": 0.3289, "step": 4433 }, { "epoch": 403.0969696969697, "grad_norm": 0.3095831871032715, "learning_rate": 5.66e-06, "loss": 0.3444, "step": 4434 }, { "epoch": 403.1939393939394, "grad_norm": 0.24900095164775848, "learning_rate": 5.65e-06, "loss": 0.2999, "step": 4435 }, { "epoch": 403.2909090909091, "grad_norm": 0.28118544816970825, "learning_rate": 5.64e-06, "loss": 0.3081, "step": 4436 }, { "epoch": 403.3878787878788, "grad_norm": 0.3060985803604126, "learning_rate": 5.63e-06, "loss": 0.3327, "step": 4437 }, { "epoch": 403.4848484848485, "grad_norm": 0.252957284450531, "learning_rate": 5.62e-06, "loss": 0.3445, "step": 4438 }, { "epoch": 403.58181818181816, "grad_norm": 0.2750088572502136, "learning_rate": 5.61e-06, "loss": 0.3123, "step": 4439 }, { "epoch": 403.6787878787879, "grad_norm": 0.3438875079154968, "learning_rate": 5.600000000000001e-06, "loss": 0.3181, "step": 4440 }, { "epoch": 403.6787878787879, "eval_loss": 0.42397940158843994, "eval_runtime": 2.1375, "eval_samples_per_second": 25.732, "eval_steps_per_second": 3.275, "step": 4440 }, { "epoch": 403.77575757575755, "grad_norm": 0.2744629681110382, "learning_rate": 5.59e-06, "loss": 0.3433, "step": 4441 }, { "epoch": 403.8727272727273, "grad_norm": 0.29330527782440186, "learning_rate": 5.580000000000001e-06, "loss": 0.3222, "step": 4442 }, { "epoch": 403.969696969697, "grad_norm": 0.2824782431125641, "learning_rate": 5.57e-06, "loss": 0.3402, "step": 4443 }, { "epoch": 404.0, "grad_norm": 0.6952817440032959, "learning_rate": 5.56e-06, "loss": 0.3728, "step": 4444 }, { "epoch": 404.0969696969697, "grad_norm": 0.3297734558582306, "learning_rate": 5.55e-06, "loss": 0.3097, "step": 4445 }, { "epoch": 404.1939393939394, "grad_norm": 0.29496389627456665, "learning_rate": 5.54e-06, "loss": 0.3369, "step": 4446 }, { "epoch": 404.2909090909091, "grad_norm": 0.30634990334510803, "learning_rate": 5.53e-06, "loss": 0.3165, "step": 4447 }, { "epoch": 404.3878787878788, "grad_norm": 0.25977566838264465, "learning_rate": 5.5200000000000005e-06, "loss": 0.3423, "step": 4448 }, { "epoch": 404.4848484848485, "grad_norm": 0.30670416355133057, "learning_rate": 5.510000000000001e-06, "loss": 0.3214, "step": 4449 }, { "epoch": 404.58181818181816, "grad_norm": 0.41724225878715515, "learning_rate": 5.500000000000001e-06, "loss": 0.327, "step": 4450 }, { "epoch": 404.58181818181816, "eval_loss": 0.423881471157074, "eval_runtime": 2.1176, "eval_samples_per_second": 25.973, "eval_steps_per_second": 3.306, "step": 4450 }, { "epoch": 404.6787878787879, "grad_norm": 0.2812544107437134, "learning_rate": 5.49e-06, "loss": 0.3541, "step": 4451 }, { "epoch": 404.77575757575755, "grad_norm": 0.3451825678348541, "learning_rate": 5.48e-06, "loss": 0.3401, "step": 4452 }, { "epoch": 404.8727272727273, "grad_norm": 0.2523994445800781, "learning_rate": 5.47e-06, "loss": 0.3159, "step": 4453 }, { "epoch": 404.969696969697, "grad_norm": 0.2768281400203705, "learning_rate": 5.46e-06, "loss": 0.3254, "step": 4454 }, { "epoch": 405.0, "grad_norm": 0.5321841239929199, "learning_rate": 5.45e-06, "loss": 0.3025, "step": 4455 }, { "epoch": 405.0969696969697, "grad_norm": 0.30620166659355164, "learning_rate": 5.44e-06, "loss": 0.3197, "step": 4456 }, { "epoch": 405.1939393939394, "grad_norm": 0.3524479866027832, "learning_rate": 5.4300000000000005e-06, "loss": 0.3438, "step": 4457 }, { "epoch": 405.2909090909091, "grad_norm": 0.27902770042419434, "learning_rate": 5.42e-06, "loss": 0.353, "step": 4458 }, { "epoch": 405.3878787878788, "grad_norm": 0.4012645483016968, "learning_rate": 5.410000000000001e-06, "loss": 0.3244, "step": 4459 }, { "epoch": 405.4848484848485, "grad_norm": 0.26941490173339844, "learning_rate": 5.4e-06, "loss": 0.3164, "step": 4460 }, { "epoch": 405.4848484848485, "eval_loss": 0.42388486862182617, "eval_runtime": 2.1321, "eval_samples_per_second": 25.796, "eval_steps_per_second": 3.283, "step": 4460 }, { "epoch": 405.58181818181816, "grad_norm": 0.30666786432266235, "learning_rate": 5.390000000000001e-06, "loss": 0.328, "step": 4461 }, { "epoch": 405.6787878787879, "grad_norm": 0.2669554650783539, "learning_rate": 5.38e-06, "loss": 0.3195, "step": 4462 }, { "epoch": 405.77575757575755, "grad_norm": 0.34226134419441223, "learning_rate": 5.37e-06, "loss": 0.3309, "step": 4463 }, { "epoch": 405.8727272727273, "grad_norm": 0.3121098577976227, "learning_rate": 5.36e-06, "loss": 0.3022, "step": 4464 }, { "epoch": 405.969696969697, "grad_norm": 0.2962389588356018, "learning_rate": 5.3500000000000004e-06, "loss": 0.3432, "step": 4465 }, { "epoch": 406.0, "grad_norm": 0.5181310772895813, "learning_rate": 5.3400000000000005e-06, "loss": 0.3171, "step": 4466 }, { "epoch": 406.0969696969697, "grad_norm": 0.2788962721824646, "learning_rate": 5.330000000000001e-06, "loss": 0.2907, "step": 4467 }, { "epoch": 406.1939393939394, "grad_norm": 0.23445305228233337, "learning_rate": 5.32e-06, "loss": 0.3339, "step": 4468 }, { "epoch": 406.2909090909091, "grad_norm": 0.2866305112838745, "learning_rate": 5.31e-06, "loss": 0.3472, "step": 4469 }, { "epoch": 406.3878787878788, "grad_norm": 0.27106624841690063, "learning_rate": 5.3e-06, "loss": 0.3226, "step": 4470 }, { "epoch": 406.3878787878788, "eval_loss": 0.42396581172943115, "eval_runtime": 2.1231, "eval_samples_per_second": 25.905, "eval_steps_per_second": 3.297, "step": 4470 }, { "epoch": 406.4848484848485, "grad_norm": 0.2928353548049927, "learning_rate": 5.29e-06, "loss": 0.2981, "step": 4471 }, { "epoch": 406.58181818181816, "grad_norm": 0.27284109592437744, "learning_rate": 5.28e-06, "loss": 0.3282, "step": 4472 }, { "epoch": 406.6787878787879, "grad_norm": 0.33163031935691833, "learning_rate": 5.2699999999999995e-06, "loss": 0.3396, "step": 4473 }, { "epoch": 406.77575757575755, "grad_norm": 0.33143162727355957, "learning_rate": 5.2600000000000005e-06, "loss": 0.3402, "step": 4474 }, { "epoch": 406.8727272727273, "grad_norm": 0.3006884455680847, "learning_rate": 5.25e-06, "loss": 0.321, "step": 4475 }, { "epoch": 406.969696969697, "grad_norm": 0.26408207416534424, "learning_rate": 5.240000000000001e-06, "loss": 0.3447, "step": 4476 }, { "epoch": 407.0, "grad_norm": 0.5091809630393982, "learning_rate": 5.23e-06, "loss": 0.3562, "step": 4477 }, { "epoch": 407.0969696969697, "grad_norm": 0.294497549533844, "learning_rate": 5.220000000000001e-06, "loss": 0.3708, "step": 4478 }, { "epoch": 407.1939393939394, "grad_norm": 0.3147751986980438, "learning_rate": 5.21e-06, "loss": 0.3433, "step": 4479 }, { "epoch": 407.2909090909091, "grad_norm": 0.2634759247303009, "learning_rate": 5.2e-06, "loss": 0.2847, "step": 4480 }, { "epoch": 407.2909090909091, "eval_loss": 0.423898845911026, "eval_runtime": 2.118, "eval_samples_per_second": 25.967, "eval_steps_per_second": 3.305, "step": 4480 }, { "epoch": 407.3878787878788, "grad_norm": 0.26809194684028625, "learning_rate": 5.19e-06, "loss": 0.3095, "step": 4481 }, { "epoch": 407.4848484848485, "grad_norm": 0.30752381682395935, "learning_rate": 5.18e-06, "loss": 0.3521, "step": 4482 }, { "epoch": 407.58181818181816, "grad_norm": 0.2728431820869446, "learning_rate": 5.1700000000000005e-06, "loss": 0.2942, "step": 4483 }, { "epoch": 407.6787878787879, "grad_norm": 0.3156808614730835, "learning_rate": 5.1600000000000006e-06, "loss": 0.3183, "step": 4484 }, { "epoch": 407.77575757575755, "grad_norm": 0.28744783997535706, "learning_rate": 5.15e-06, "loss": 0.3612, "step": 4485 }, { "epoch": 407.8727272727273, "grad_norm": 0.3444119691848755, "learning_rate": 5.140000000000001e-06, "loss": 0.2986, "step": 4486 }, { "epoch": 407.969696969697, "grad_norm": 0.33266812562942505, "learning_rate": 5.13e-06, "loss": 0.3468, "step": 4487 }, { "epoch": 408.0, "grad_norm": 0.46602705121040344, "learning_rate": 5.12e-06, "loss": 0.327, "step": 4488 }, { "epoch": 408.0969696969697, "grad_norm": 0.3970167636871338, "learning_rate": 5.11e-06, "loss": 0.3239, "step": 4489 }, { "epoch": 408.1939393939394, "grad_norm": 0.24424688518047333, "learning_rate": 5.1e-06, "loss": 0.3289, "step": 4490 }, { "epoch": 408.1939393939394, "eval_loss": 0.4239845871925354, "eval_runtime": 2.1275, "eval_samples_per_second": 25.851, "eval_steps_per_second": 3.29, "step": 4490 }, { "epoch": 408.2909090909091, "grad_norm": 0.26348650455474854, "learning_rate": 5.09e-06, "loss": 0.3393, "step": 4491 }, { "epoch": 408.3878787878788, "grad_norm": 0.2709069550037384, "learning_rate": 5.08e-06, "loss": 0.3009, "step": 4492 }, { "epoch": 408.4848484848485, "grad_norm": 0.2873727083206177, "learning_rate": 5.070000000000001e-06, "loss": 0.2914, "step": 4493 }, { "epoch": 408.58181818181816, "grad_norm": 0.36658114194869995, "learning_rate": 5.06e-06, "loss": 0.3573, "step": 4494 }, { "epoch": 408.6787878787879, "grad_norm": 0.2843858003616333, "learning_rate": 5.050000000000001e-06, "loss": 0.3047, "step": 4495 }, { "epoch": 408.77575757575755, "grad_norm": 0.30994489789009094, "learning_rate": 5.04e-06, "loss": 0.3566, "step": 4496 }, { "epoch": 408.8727272727273, "grad_norm": 0.2767634689807892, "learning_rate": 5.03e-06, "loss": 0.3307, "step": 4497 }, { "epoch": 408.969696969697, "grad_norm": 0.3072282075881958, "learning_rate": 5.02e-06, "loss": 0.3204, "step": 4498 }, { "epoch": 409.0, "grad_norm": 0.6475484371185303, "learning_rate": 5.01e-06, "loss": 0.3943, "step": 4499 }, { "epoch": 409.0969696969697, "grad_norm": 0.3126541078090668, "learning_rate": 5e-06, "loss": 0.3417, "step": 4500 }, { "epoch": 409.0969696969697, "eval_loss": 0.4239867925643921, "eval_runtime": 2.1314, "eval_samples_per_second": 25.804, "eval_steps_per_second": 3.284, "step": 4500 }, { "epoch": 409.1939393939394, "grad_norm": 0.31892114877700806, "learning_rate": 4.9900000000000005e-06, "loss": 0.3238, "step": 4501 }, { "epoch": 409.2909090909091, "grad_norm": 0.3017483055591583, "learning_rate": 4.98e-06, "loss": 0.3444, "step": 4502 }, { "epoch": 409.3878787878788, "grad_norm": 0.2635285258293152, "learning_rate": 4.970000000000001e-06, "loss": 0.311, "step": 4503 }, { "epoch": 409.4848484848485, "grad_norm": 0.2573358714580536, "learning_rate": 4.96e-06, "loss": 0.3069, "step": 4504 }, { "epoch": 409.58181818181816, "grad_norm": 0.28138741850852966, "learning_rate": 4.950000000000001e-06, "loss": 0.3414, "step": 4505 }, { "epoch": 409.6787878787879, "grad_norm": 0.24946054816246033, "learning_rate": 4.94e-06, "loss": 0.3883, "step": 4506 }, { "epoch": 409.77575757575755, "grad_norm": 0.33677899837493896, "learning_rate": 4.93e-06, "loss": 0.3041, "step": 4507 }, { "epoch": 409.8727272727273, "grad_norm": 0.30087172985076904, "learning_rate": 4.92e-06, "loss": 0.3296, "step": 4508 }, { "epoch": 409.969696969697, "grad_norm": 0.29604464769363403, "learning_rate": 4.9100000000000004e-06, "loss": 0.2914, "step": 4509 }, { "epoch": 410.0, "grad_norm": 0.5490189790725708, "learning_rate": 4.9000000000000005e-06, "loss": 0.3065, "step": 4510 }, { "epoch": 410.0, "eval_loss": 0.4238167703151703, "eval_runtime": 2.1079, "eval_samples_per_second": 26.092, "eval_steps_per_second": 3.321, "step": 4510 }, { "epoch": 410.0969696969697, "grad_norm": 0.27711138129234314, "learning_rate": 4.89e-06, "loss": 0.3376, "step": 4511 }, { "epoch": 410.1939393939394, "grad_norm": 0.24843066930770874, "learning_rate": 4.880000000000001e-06, "loss": 0.3168, "step": 4512 }, { "epoch": 410.2909090909091, "grad_norm": 0.3280465304851532, "learning_rate": 4.87e-06, "loss": 0.3036, "step": 4513 }, { "epoch": 410.3878787878788, "grad_norm": 0.3124517500400543, "learning_rate": 4.86e-06, "loss": 0.3217, "step": 4514 }, { "epoch": 410.4848484848485, "grad_norm": 0.2721874415874481, "learning_rate": 4.85e-06, "loss": 0.362, "step": 4515 }, { "epoch": 410.58181818181816, "grad_norm": 0.27525606751441956, "learning_rate": 4.84e-06, "loss": 0.3408, "step": 4516 }, { "epoch": 410.6787878787879, "grad_norm": 0.2887386381626129, "learning_rate": 4.83e-06, "loss": 0.3291, "step": 4517 }, { "epoch": 410.77575757575755, "grad_norm": 0.299444317817688, "learning_rate": 4.8200000000000004e-06, "loss": 0.3377, "step": 4518 }, { "epoch": 410.8727272727273, "grad_norm": 0.2675188481807709, "learning_rate": 4.81e-06, "loss": 0.3157, "step": 4519 }, { "epoch": 410.969696969697, "grad_norm": 0.2881135046482086, "learning_rate": 4.800000000000001e-06, "loss": 0.309, "step": 4520 }, { "epoch": 410.969696969697, "eval_loss": 0.42392778396606445, "eval_runtime": 2.1506, "eval_samples_per_second": 25.574, "eval_steps_per_second": 3.255, "step": 4520 }, { "epoch": 411.0, "grad_norm": 0.521273672580719, "learning_rate": 4.79e-06, "loss": 0.3303, "step": 4521 }, { "epoch": 411.0969696969697, "grad_norm": 0.29321396350860596, "learning_rate": 4.780000000000001e-06, "loss": 0.3009, "step": 4522 }, { "epoch": 411.1939393939394, "grad_norm": 0.26560118794441223, "learning_rate": 4.77e-06, "loss": 0.3558, "step": 4523 }, { "epoch": 411.2909090909091, "grad_norm": 0.2869216501712799, "learning_rate": 4.76e-06, "loss": 0.3183, "step": 4524 }, { "epoch": 411.3878787878788, "grad_norm": 0.27583593130111694, "learning_rate": 4.75e-06, "loss": 0.3387, "step": 4525 }, { "epoch": 411.4848484848485, "grad_norm": 0.30815714597702026, "learning_rate": 4.74e-06, "loss": 0.3649, "step": 4526 }, { "epoch": 411.58181818181816, "grad_norm": 0.30801698565483093, "learning_rate": 4.7300000000000005e-06, "loss": 0.3698, "step": 4527 }, { "epoch": 411.6787878787879, "grad_norm": 0.2815544605255127, "learning_rate": 4.72e-06, "loss": 0.284, "step": 4528 }, { "epoch": 411.77575757575755, "grad_norm": 0.2630556523799896, "learning_rate": 4.710000000000001e-06, "loss": 0.3308, "step": 4529 }, { "epoch": 411.8727272727273, "grad_norm": 0.3057383894920349, "learning_rate": 4.7e-06, "loss": 0.3192, "step": 4530 }, { "epoch": 411.8727272727273, "eval_loss": 0.4240568280220032, "eval_runtime": 2.117, "eval_samples_per_second": 25.98, "eval_steps_per_second": 3.306, "step": 4530 }, { "epoch": 411.969696969697, "grad_norm": 0.2713818848133087, "learning_rate": 4.69e-06, "loss": 0.3087, "step": 4531 }, { "epoch": 412.0, "grad_norm": 0.577849268913269, "learning_rate": 4.68e-06, "loss": 0.2719, "step": 4532 }, { "epoch": 412.0969696969697, "grad_norm": 0.2892434597015381, "learning_rate": 4.67e-06, "loss": 0.3245, "step": 4533 }, { "epoch": 412.1939393939394, "grad_norm": 0.25237318873405457, "learning_rate": 4.66e-06, "loss": 0.3474, "step": 4534 }, { "epoch": 412.2909090909091, "grad_norm": 0.3992471992969513, "learning_rate": 4.65e-06, "loss": 0.2817, "step": 4535 }, { "epoch": 412.3878787878788, "grad_norm": 0.3084762692451477, "learning_rate": 4.64e-06, "loss": 0.3598, "step": 4536 }, { "epoch": 412.4848484848485, "grad_norm": 0.31249937415122986, "learning_rate": 4.6300000000000006e-06, "loss": 0.3552, "step": 4537 }, { "epoch": 412.58181818181816, "grad_norm": 0.29190361499786377, "learning_rate": 4.62e-06, "loss": 0.3039, "step": 4538 }, { "epoch": 412.6787878787879, "grad_norm": 0.29947835206985474, "learning_rate": 4.610000000000001e-06, "loss": 0.2769, "step": 4539 }, { "epoch": 412.77575757575755, "grad_norm": 0.30425596237182617, "learning_rate": 4.6e-06, "loss": 0.342, "step": 4540 }, { "epoch": 412.77575757575755, "eval_loss": 0.4239722788333893, "eval_runtime": 2.1347, "eval_samples_per_second": 25.765, "eval_steps_per_second": 3.279, "step": 4540 }, { "epoch": 412.8727272727273, "grad_norm": 0.2873094081878662, "learning_rate": 4.590000000000001e-06, "loss": 0.3116, "step": 4541 }, { "epoch": 412.969696969697, "grad_norm": 0.2804364562034607, "learning_rate": 4.58e-06, "loss": 0.3688, "step": 4542 }, { "epoch": 413.0, "grad_norm": 0.5596648454666138, "learning_rate": 4.57e-06, "loss": 0.3303, "step": 4543 }, { "epoch": 413.0969696969697, "grad_norm": 0.27821826934814453, "learning_rate": 4.56e-06, "loss": 0.3208, "step": 4544 }, { "epoch": 413.1939393939394, "grad_norm": 0.2921523451805115, "learning_rate": 4.5500000000000005e-06, "loss": 0.3403, "step": 4545 }, { "epoch": 413.2909090909091, "grad_norm": 0.2964740991592407, "learning_rate": 4.540000000000001e-06, "loss": 0.34, "step": 4546 }, { "epoch": 413.3878787878788, "grad_norm": 0.2719631493091583, "learning_rate": 4.53e-06, "loss": 0.3357, "step": 4547 }, { "epoch": 413.4848484848485, "grad_norm": 0.2727508246898651, "learning_rate": 4.52e-06, "loss": 0.3318, "step": 4548 }, { "epoch": 413.58181818181816, "grad_norm": 0.27473440766334534, "learning_rate": 4.51e-06, "loss": 0.3354, "step": 4549 }, { "epoch": 413.6787878787879, "grad_norm": 0.3429303467273712, "learning_rate": 4.5e-06, "loss": 0.3306, "step": 4550 }, { "epoch": 413.6787878787879, "eval_loss": 0.4237283170223236, "eval_runtime": 2.1253, "eval_samples_per_second": 25.879, "eval_steps_per_second": 3.294, "step": 4550 }, { "epoch": 413.77575757575755, "grad_norm": 0.25924238562583923, "learning_rate": 4.49e-06, "loss": 0.3218, "step": 4551 }, { "epoch": 413.8727272727273, "grad_norm": 0.3040829002857208, "learning_rate": 4.48e-06, "loss": 0.2817, "step": 4552 }, { "epoch": 413.969696969697, "grad_norm": 0.31921207904815674, "learning_rate": 4.4699999999999996e-06, "loss": 0.3438, "step": 4553 }, { "epoch": 414.0, "grad_norm": 0.4776359498500824, "learning_rate": 4.4600000000000005e-06, "loss": 0.3055, "step": 4554 }, { "epoch": 414.0969696969697, "grad_norm": 0.38110432028770447, "learning_rate": 4.45e-06, "loss": 0.3258, "step": 4555 }, { "epoch": 414.1939393939394, "grad_norm": 0.3182128965854645, "learning_rate": 4.440000000000001e-06, "loss": 0.2973, "step": 4556 }, { "epoch": 414.2909090909091, "grad_norm": 0.2799331247806549, "learning_rate": 4.43e-06, "loss": 0.3438, "step": 4557 }, { "epoch": 414.3878787878788, "grad_norm": 0.2912656366825104, "learning_rate": 4.420000000000001e-06, "loss": 0.3162, "step": 4558 }, { "epoch": 414.4848484848485, "grad_norm": 0.2915773391723633, "learning_rate": 4.41e-06, "loss": 0.3664, "step": 4559 }, { "epoch": 414.58181818181816, "grad_norm": 0.2833085060119629, "learning_rate": 4.4e-06, "loss": 0.3063, "step": 4560 }, { "epoch": 414.58181818181816, "eval_loss": 0.4240158498287201, "eval_runtime": 2.107, "eval_samples_per_second": 26.103, "eval_steps_per_second": 3.322, "step": 4560 }, { "epoch": 414.6787878787879, "grad_norm": 0.28285571932792664, "learning_rate": 4.39e-06, "loss": 0.325, "step": 4561 }, { "epoch": 414.77575757575755, "grad_norm": 0.2957201898097992, "learning_rate": 4.38e-06, "loss": 0.3156, "step": 4562 }, { "epoch": 414.8727272727273, "grad_norm": 0.28091609477996826, "learning_rate": 4.3700000000000005e-06, "loss": 0.3411, "step": 4563 }, { "epoch": 414.969696969697, "grad_norm": 0.3014490306377411, "learning_rate": 4.360000000000001e-06, "loss": 0.3379, "step": 4564 }, { "epoch": 415.0, "grad_norm": 0.4831256866455078, "learning_rate": 4.35e-06, "loss": 0.3259, "step": 4565 }, { "epoch": 415.0969696969697, "grad_norm": 0.27512794733047485, "learning_rate": 4.34e-06, "loss": 0.3098, "step": 4566 }, { "epoch": 415.1939393939394, "grad_norm": 0.28693804144859314, "learning_rate": 4.33e-06, "loss": 0.3027, "step": 4567 }, { "epoch": 415.2909090909091, "grad_norm": 0.32689154148101807, "learning_rate": 4.32e-06, "loss": 0.3277, "step": 4568 }, { "epoch": 415.3878787878788, "grad_norm": 0.32227641344070435, "learning_rate": 4.31e-06, "loss": 0.3702, "step": 4569 }, { "epoch": 415.4848484848485, "grad_norm": 0.271851122379303, "learning_rate": 4.2999999999999995e-06, "loss": 0.3373, "step": 4570 }, { "epoch": 415.4848484848485, "eval_loss": 0.42395830154418945, "eval_runtime": 2.1299, "eval_samples_per_second": 25.823, "eval_steps_per_second": 3.287, "step": 4570 }, { "epoch": 415.58181818181816, "grad_norm": 0.28459352254867554, "learning_rate": 4.2900000000000004e-06, "loss": 0.3204, "step": 4571 }, { "epoch": 415.6787878787879, "grad_norm": 0.2655867636203766, "learning_rate": 4.28e-06, "loss": 0.2963, "step": 4572 }, { "epoch": 415.77575757575755, "grad_norm": 0.29420235753059387, "learning_rate": 4.270000000000001e-06, "loss": 0.3227, "step": 4573 }, { "epoch": 415.8727272727273, "grad_norm": 0.27869269251823425, "learning_rate": 4.26e-06, "loss": 0.3522, "step": 4574 }, { "epoch": 415.969696969697, "grad_norm": 0.27301132678985596, "learning_rate": 4.250000000000001e-06, "loss": 0.3521, "step": 4575 }, { "epoch": 416.0, "grad_norm": 0.517616868019104, "learning_rate": 4.24e-06, "loss": 0.263, "step": 4576 }, { "epoch": 416.0969696969697, "grad_norm": 0.2560671269893646, "learning_rate": 4.23e-06, "loss": 0.3247, "step": 4577 }, { "epoch": 416.1939393939394, "grad_norm": 0.28626900911331177, "learning_rate": 4.22e-06, "loss": 0.3398, "step": 4578 }, { "epoch": 416.2909090909091, "grad_norm": 0.25116318464279175, "learning_rate": 4.21e-06, "loss": 0.3497, "step": 4579 }, { "epoch": 416.3878787878788, "grad_norm": 0.2865499258041382, "learning_rate": 4.2000000000000004e-06, "loss": 0.3258, "step": 4580 }, { "epoch": 416.3878787878788, "eval_loss": 0.4238404333591461, "eval_runtime": 2.1121, "eval_samples_per_second": 26.041, "eval_steps_per_second": 3.314, "step": 4580 }, { "epoch": 416.4848484848485, "grad_norm": 0.2667963206768036, "learning_rate": 4.1900000000000005e-06, "loss": 0.3043, "step": 4581 }, { "epoch": 416.58181818181816, "grad_norm": 0.3104817569255829, "learning_rate": 4.18e-06, "loss": 0.3103, "step": 4582 }, { "epoch": 416.6787878787879, "grad_norm": 0.34828588366508484, "learning_rate": 4.17e-06, "loss": 0.3354, "step": 4583 }, { "epoch": 416.77575757575755, "grad_norm": 0.23135364055633545, "learning_rate": 4.16e-06, "loss": 0.3084, "step": 4584 }, { "epoch": 416.8727272727273, "grad_norm": 0.2692182660102844, "learning_rate": 4.15e-06, "loss": 0.3481, "step": 4585 }, { "epoch": 416.969696969697, "grad_norm": 0.300054132938385, "learning_rate": 4.14e-06, "loss": 0.3198, "step": 4586 }, { "epoch": 417.0, "grad_norm": 0.6591640710830688, "learning_rate": 4.13e-06, "loss": 0.3431, "step": 4587 }, { "epoch": 417.0969696969697, "grad_norm": 0.2736823558807373, "learning_rate": 4.12e-06, "loss": 0.3235, "step": 4588 }, { "epoch": 417.1939393939394, "grad_norm": 0.30306321382522583, "learning_rate": 4.11e-06, "loss": 0.3591, "step": 4589 }, { "epoch": 417.2909090909091, "grad_norm": 0.2945916950702667, "learning_rate": 4.1000000000000006e-06, "loss": 0.3196, "step": 4590 }, { "epoch": 417.2909090909091, "eval_loss": 0.42390501499176025, "eval_runtime": 2.1312, "eval_samples_per_second": 25.807, "eval_steps_per_second": 3.285, "step": 4590 }, { "epoch": 417.3878787878788, "grad_norm": 0.3049025535583496, "learning_rate": 4.09e-06, "loss": 0.2935, "step": 4591 }, { "epoch": 417.4848484848485, "grad_norm": 0.2787110507488251, "learning_rate": 4.080000000000001e-06, "loss": 0.314, "step": 4592 }, { "epoch": 417.58181818181816, "grad_norm": 0.2911776006221771, "learning_rate": 4.07e-06, "loss": 0.3078, "step": 4593 }, { "epoch": 417.6787878787879, "grad_norm": 0.26729869842529297, "learning_rate": 4.06e-06, "loss": 0.3457, "step": 4594 }, { "epoch": 417.77575757575755, "grad_norm": 0.2834504544734955, "learning_rate": 4.05e-06, "loss": 0.3142, "step": 4595 }, { "epoch": 417.8727272727273, "grad_norm": 0.28690284490585327, "learning_rate": 4.04e-06, "loss": 0.3359, "step": 4596 }, { "epoch": 417.969696969697, "grad_norm": 0.2627941071987152, "learning_rate": 4.03e-06, "loss": 0.3468, "step": 4597 }, { "epoch": 418.0, "grad_norm": 0.5440768003463745, "learning_rate": 4.0200000000000005e-06, "loss": 0.3624, "step": 4598 }, { "epoch": 418.0969696969697, "grad_norm": 0.27621591091156006, "learning_rate": 4.01e-06, "loss": 0.3036, "step": 4599 }, { "epoch": 418.1939393939394, "grad_norm": 0.29796677827835083, "learning_rate": 4.000000000000001e-06, "loss": 0.336, "step": 4600 }, { "epoch": 418.1939393939394, "eval_loss": 0.4240822494029999, "eval_runtime": 2.1218, "eval_samples_per_second": 25.922, "eval_steps_per_second": 3.299, "step": 4600 }, { "epoch": 418.2909090909091, "grad_norm": 0.26402226090431213, "learning_rate": 3.99e-06, "loss": 0.3005, "step": 4601 }, { "epoch": 418.3878787878788, "grad_norm": 0.29714128375053406, "learning_rate": 3.98e-06, "loss": 0.3396, "step": 4602 }, { "epoch": 418.4848484848485, "grad_norm": 0.2947816848754883, "learning_rate": 3.97e-06, "loss": 0.3243, "step": 4603 }, { "epoch": 418.58181818181816, "grad_norm": 0.32920148968696594, "learning_rate": 3.96e-06, "loss": 0.346, "step": 4604 }, { "epoch": 418.6787878787879, "grad_norm": 0.3153909146785736, "learning_rate": 3.95e-06, "loss": 0.3285, "step": 4605 }, { "epoch": 418.77575757575755, "grad_norm": 0.28215932846069336, "learning_rate": 3.9399999999999995e-06, "loss": 0.3619, "step": 4606 }, { "epoch": 418.8727272727273, "grad_norm": 0.2899491786956787, "learning_rate": 3.9300000000000005e-06, "loss": 0.3283, "step": 4607 }, { "epoch": 418.969696969697, "grad_norm": 0.2625412940979004, "learning_rate": 3.92e-06, "loss": 0.2992, "step": 4608 }, { "epoch": 419.0, "grad_norm": 0.6752861142158508, "learning_rate": 3.910000000000001e-06, "loss": 0.3287, "step": 4609 }, { "epoch": 419.0969696969697, "grad_norm": 0.2802707254886627, "learning_rate": 3.9e-06, "loss": 0.339, "step": 4610 }, { "epoch": 419.0969696969697, "eval_loss": 0.4236716330051422, "eval_runtime": 2.1068, "eval_samples_per_second": 26.106, "eval_steps_per_second": 3.323, "step": 4610 }, { "epoch": 419.1939393939394, "grad_norm": 0.26663121581077576, "learning_rate": 3.89e-06, "loss": 0.3388, "step": 4611 }, { "epoch": 419.2909090909091, "grad_norm": 0.31221041083335876, "learning_rate": 3.88e-06, "loss": 0.2823, "step": 4612 }, { "epoch": 419.3878787878788, "grad_norm": 0.3001995086669922, "learning_rate": 3.87e-06, "loss": 0.3422, "step": 4613 }, { "epoch": 419.4848484848485, "grad_norm": 0.29675009846687317, "learning_rate": 3.86e-06, "loss": 0.3397, "step": 4614 }, { "epoch": 419.58181818181816, "grad_norm": 0.3295038938522339, "learning_rate": 3.85e-06, "loss": 0.3224, "step": 4615 }, { "epoch": 419.6787878787879, "grad_norm": 0.2736305892467499, "learning_rate": 3.84e-06, "loss": 0.317, "step": 4616 }, { "epoch": 419.77575757575755, "grad_norm": 0.32741448283195496, "learning_rate": 3.830000000000001e-06, "loss": 0.3323, "step": 4617 }, { "epoch": 419.8727272727273, "grad_norm": 0.2668072581291199, "learning_rate": 3.82e-06, "loss": 0.3605, "step": 4618 }, { "epoch": 419.969696969697, "grad_norm": 0.29451069235801697, "learning_rate": 3.8100000000000004e-06, "loss": 0.293, "step": 4619 }, { "epoch": 420.0, "grad_norm": 0.40569227933883667, "learning_rate": 3.8e-06, "loss": 0.3387, "step": 4620 }, { "epoch": 420.0, "eval_loss": 0.42370980978012085, "eval_runtime": 2.116, "eval_samples_per_second": 25.992, "eval_steps_per_second": 3.308, "step": 4620 }, { "epoch": 420.0969696969697, "grad_norm": 0.27761849761009216, "learning_rate": 3.7900000000000006e-06, "loss": 0.3157, "step": 4621 }, { "epoch": 420.1939393939394, "grad_norm": 0.31935086846351624, "learning_rate": 3.7800000000000002e-06, "loss": 0.3145, "step": 4622 }, { "epoch": 420.2909090909091, "grad_norm": 0.23702049255371094, "learning_rate": 3.77e-06, "loss": 0.2684, "step": 4623 }, { "epoch": 420.3878787878788, "grad_norm": 0.2601977586746216, "learning_rate": 3.7600000000000004e-06, "loss": 0.3297, "step": 4624 }, { "epoch": 420.4848484848485, "grad_norm": 0.26816099882125854, "learning_rate": 3.75e-06, "loss": 0.3325, "step": 4625 }, { "epoch": 420.58181818181816, "grad_norm": 0.2610798180103302, "learning_rate": 3.7400000000000006e-06, "loss": 0.2982, "step": 4626 }, { "epoch": 420.6787878787879, "grad_norm": 0.2762024700641632, "learning_rate": 3.7300000000000003e-06, "loss": 0.3697, "step": 4627 }, { "epoch": 420.77575757575755, "grad_norm": 0.29188522696495056, "learning_rate": 3.72e-06, "loss": 0.3499, "step": 4628 }, { "epoch": 420.8727272727273, "grad_norm": 0.30248361825942993, "learning_rate": 3.7100000000000005e-06, "loss": 0.3439, "step": 4629 }, { "epoch": 420.969696969697, "grad_norm": 0.29372766613960266, "learning_rate": 3.7e-06, "loss": 0.3216, "step": 4630 }, { "epoch": 420.969696969697, "eval_loss": 0.4238877594470978, "eval_runtime": 2.1381, "eval_samples_per_second": 25.724, "eval_steps_per_second": 3.274, "step": 4630 }, { "epoch": 421.0, "grad_norm": 0.4670877158641815, "learning_rate": 3.6900000000000002e-06, "loss": 0.3945, "step": 4631 }, { "epoch": 421.0969696969697, "grad_norm": 0.3275664150714874, "learning_rate": 3.68e-06, "loss": 0.3402, "step": 4632 }, { "epoch": 421.1939393939394, "grad_norm": 0.302420437335968, "learning_rate": 3.6700000000000004e-06, "loss": 0.3289, "step": 4633 }, { "epoch": 421.2909090909091, "grad_norm": 0.3075823485851288, "learning_rate": 3.66e-06, "loss": 0.2901, "step": 4634 }, { "epoch": 421.3878787878788, "grad_norm": 0.31915780901908875, "learning_rate": 3.6499999999999998e-06, "loss": 0.3143, "step": 4635 }, { "epoch": 421.4848484848485, "grad_norm": 0.2630278766155243, "learning_rate": 3.6400000000000003e-06, "loss": 0.3389, "step": 4636 }, { "epoch": 421.58181818181816, "grad_norm": 0.26861655712127686, "learning_rate": 3.63e-06, "loss": 0.3433, "step": 4637 }, { "epoch": 421.6787878787879, "grad_norm": 0.27271535992622375, "learning_rate": 3.6200000000000005e-06, "loss": 0.2828, "step": 4638 }, { "epoch": 421.77575757575755, "grad_norm": 0.28074952960014343, "learning_rate": 3.61e-06, "loss": 0.3431, "step": 4639 }, { "epoch": 421.8727272727273, "grad_norm": 0.29417550563812256, "learning_rate": 3.6e-06, "loss": 0.3302, "step": 4640 }, { "epoch": 421.8727272727273, "eval_loss": 0.4238133728504181, "eval_runtime": 2.1098, "eval_samples_per_second": 26.068, "eval_steps_per_second": 3.318, "step": 4640 }, { "epoch": 421.969696969697, "grad_norm": 0.29620280861854553, "learning_rate": 3.5900000000000004e-06, "loss": 0.3514, "step": 4641 }, { "epoch": 422.0, "grad_norm": 0.6159494519233704, "learning_rate": 3.58e-06, "loss": 0.3477, "step": 4642 }, { "epoch": 422.0969696969697, "grad_norm": 0.3032165765762329, "learning_rate": 3.5700000000000005e-06, "loss": 0.343, "step": 4643 }, { "epoch": 422.1939393939394, "grad_norm": 0.2550334632396698, "learning_rate": 3.5600000000000002e-06, "loss": 0.3302, "step": 4644 }, { "epoch": 422.2909090909091, "grad_norm": 0.27611789107322693, "learning_rate": 3.55e-06, "loss": 0.3381, "step": 4645 }, { "epoch": 422.3878787878788, "grad_norm": 0.28917229175567627, "learning_rate": 3.5400000000000004e-06, "loss": 0.326, "step": 4646 }, { "epoch": 422.4848484848485, "grad_norm": 0.3703041672706604, "learning_rate": 3.53e-06, "loss": 0.337, "step": 4647 }, { "epoch": 422.58181818181816, "grad_norm": 0.24577577412128448, "learning_rate": 3.52e-06, "loss": 0.3145, "step": 4648 }, { "epoch": 422.6787878787879, "grad_norm": 0.27131393551826477, "learning_rate": 3.5100000000000003e-06, "loss": 0.3543, "step": 4649 }, { "epoch": 422.77575757575755, "grad_norm": 0.24354679882526398, "learning_rate": 3.5000000000000004e-06, "loss": 0.2833, "step": 4650 }, { "epoch": 422.77575757575755, "eval_loss": 0.42374733090400696, "eval_runtime": 2.134, "eval_samples_per_second": 25.773, "eval_steps_per_second": 3.28, "step": 4650 }, { "epoch": 422.8727272727273, "grad_norm": 0.2760464549064636, "learning_rate": 3.49e-06, "loss": 0.309, "step": 4651 }, { "epoch": 422.969696969697, "grad_norm": 0.30734682083129883, "learning_rate": 3.4799999999999997e-06, "loss": 0.3718, "step": 4652 }, { "epoch": 423.0, "grad_norm": 0.4714656472206116, "learning_rate": 3.4700000000000002e-06, "loss": 0.2027, "step": 4653 }, { "epoch": 423.0969696969697, "grad_norm": 0.2786318361759186, "learning_rate": 3.46e-06, "loss": 0.3422, "step": 4654 }, { "epoch": 423.1939393939394, "grad_norm": 0.31899377703666687, "learning_rate": 3.4500000000000004e-06, "loss": 0.3328, "step": 4655 }, { "epoch": 423.2909090909091, "grad_norm": 0.2987779378890991, "learning_rate": 3.44e-06, "loss": 0.3495, "step": 4656 }, { "epoch": 423.3878787878788, "grad_norm": 0.3024296462535858, "learning_rate": 3.4299999999999998e-06, "loss": 0.289, "step": 4657 }, { "epoch": 423.4848484848485, "grad_norm": 0.33767056465148926, "learning_rate": 3.4200000000000003e-06, "loss": 0.3157, "step": 4658 }, { "epoch": 423.58181818181816, "grad_norm": 0.2794208228588104, "learning_rate": 3.41e-06, "loss": 0.3241, "step": 4659 }, { "epoch": 423.6787878787879, "grad_norm": 0.2814173698425293, "learning_rate": 3.4000000000000005e-06, "loss": 0.332, "step": 4660 }, { "epoch": 423.6787878787879, "eval_loss": 0.4236131012439728, "eval_runtime": 2.1268, "eval_samples_per_second": 25.861, "eval_steps_per_second": 3.291, "step": 4660 }, { "epoch": 423.77575757575755, "grad_norm": 0.2672629952430725, "learning_rate": 3.39e-06, "loss": 0.3408, "step": 4661 }, { "epoch": 423.8727272727273, "grad_norm": 0.26779016852378845, "learning_rate": 3.38e-06, "loss": 0.3353, "step": 4662 }, { "epoch": 423.969696969697, "grad_norm": 0.27753445506095886, "learning_rate": 3.3700000000000003e-06, "loss": 0.3237, "step": 4663 }, { "epoch": 424.0, "grad_norm": 0.4725082218647003, "learning_rate": 3.36e-06, "loss": 0.2746, "step": 4664 }, { "epoch": 424.0969696969697, "grad_norm": 0.28391534090042114, "learning_rate": 3.3500000000000005e-06, "loss": 0.3008, "step": 4665 }, { "epoch": 424.1939393939394, "grad_norm": 0.29803451895713806, "learning_rate": 3.34e-06, "loss": 0.3041, "step": 4666 }, { "epoch": 424.2909090909091, "grad_norm": 0.27545270323753357, "learning_rate": 3.3300000000000003e-06, "loss": 0.308, "step": 4667 }, { "epoch": 424.3878787878788, "grad_norm": 0.28638580441474915, "learning_rate": 3.3200000000000004e-06, "loss": 0.381, "step": 4668 }, { "epoch": 424.4848484848485, "grad_norm": 0.24369746446609497, "learning_rate": 3.31e-06, "loss": 0.298, "step": 4669 }, { "epoch": 424.58181818181816, "grad_norm": 0.27538207173347473, "learning_rate": 3.3e-06, "loss": 0.3337, "step": 4670 }, { "epoch": 424.58181818181816, "eval_loss": 0.4239676594734192, "eval_runtime": 2.1146, "eval_samples_per_second": 26.01, "eval_steps_per_second": 3.31, "step": 4670 }, { "epoch": 424.6787878787879, "grad_norm": 0.31407785415649414, "learning_rate": 3.29e-06, "loss": 0.3383, "step": 4671 }, { "epoch": 424.77575757575755, "grad_norm": 0.30840155482292175, "learning_rate": 3.2800000000000004e-06, "loss": 0.306, "step": 4672 }, { "epoch": 424.8727272727273, "grad_norm": 0.27344176173210144, "learning_rate": 3.27e-06, "loss": 0.331, "step": 4673 }, { "epoch": 424.969696969697, "grad_norm": 0.3199271857738495, "learning_rate": 3.2599999999999997e-06, "loss": 0.36, "step": 4674 }, { "epoch": 425.0, "grad_norm": 0.4382783770561218, "learning_rate": 3.2500000000000002e-06, "loss": 0.35, "step": 4675 }, { "epoch": 425.0969696969697, "grad_norm": 0.30849358439445496, "learning_rate": 3.24e-06, "loss": 0.3223, "step": 4676 }, { "epoch": 425.1939393939394, "grad_norm": 0.28263401985168457, "learning_rate": 3.2300000000000004e-06, "loss": 0.3385, "step": 4677 }, { "epoch": 425.2909090909091, "grad_norm": 0.33110207319259644, "learning_rate": 3.22e-06, "loss": 0.3222, "step": 4678 }, { "epoch": 425.3878787878788, "grad_norm": 0.25571611523628235, "learning_rate": 3.2099999999999998e-06, "loss": 0.3254, "step": 4679 }, { "epoch": 425.4848484848485, "grad_norm": 0.3212851285934448, "learning_rate": 3.2000000000000003e-06, "loss": 0.3227, "step": 4680 }, { "epoch": 425.4848484848485, "eval_loss": 0.4238819181919098, "eval_runtime": 2.1317, "eval_samples_per_second": 25.801, "eval_steps_per_second": 3.284, "step": 4680 }, { "epoch": 425.58181818181816, "grad_norm": 0.3157774806022644, "learning_rate": 3.19e-06, "loss": 0.3397, "step": 4681 }, { "epoch": 425.6787878787879, "grad_norm": 0.2688293755054474, "learning_rate": 3.1800000000000005e-06, "loss": 0.3423, "step": 4682 }, { "epoch": 425.77575757575755, "grad_norm": 0.2958885729312897, "learning_rate": 3.17e-06, "loss": 0.3099, "step": 4683 }, { "epoch": 425.8727272727273, "grad_norm": 0.2832617163658142, "learning_rate": 3.1600000000000007e-06, "loss": 0.3393, "step": 4684 }, { "epoch": 425.969696969697, "grad_norm": 0.32423144578933716, "learning_rate": 3.1500000000000003e-06, "loss": 0.3021, "step": 4685 }, { "epoch": 426.0, "grad_norm": 0.4971548318862915, "learning_rate": 3.14e-06, "loss": 0.3438, "step": 4686 }, { "epoch": 426.0969696969697, "grad_norm": 0.276876837015152, "learning_rate": 3.13e-06, "loss": 0.3225, "step": 4687 }, { "epoch": 426.1939393939394, "grad_norm": 0.2893661856651306, "learning_rate": 3.12e-06, "loss": 0.3164, "step": 4688 }, { "epoch": 426.2909090909091, "grad_norm": 0.3256908357143402, "learning_rate": 3.11e-06, "loss": 0.3228, "step": 4689 }, { "epoch": 426.3878787878788, "grad_norm": 0.30764877796173096, "learning_rate": 3.1e-06, "loss": 0.3429, "step": 4690 }, { "epoch": 426.3878787878788, "eval_loss": 0.4238271415233612, "eval_runtime": 2.1183, "eval_samples_per_second": 25.965, "eval_steps_per_second": 3.305, "step": 4690 }, { "epoch": 426.4848484848485, "grad_norm": 0.25312313437461853, "learning_rate": 3.09e-06, "loss": 0.318, "step": 4691 }, { "epoch": 426.58181818181816, "grad_norm": 0.32886362075805664, "learning_rate": 3.08e-06, "loss": 0.3334, "step": 4692 }, { "epoch": 426.6787878787879, "grad_norm": 0.2524215579032898, "learning_rate": 3.0700000000000003e-06, "loss": 0.3206, "step": 4693 }, { "epoch": 426.77575757575755, "grad_norm": 0.31209561228752136, "learning_rate": 3.06e-06, "loss": 0.324, "step": 4694 }, { "epoch": 426.8727272727273, "grad_norm": 0.2784077823162079, "learning_rate": 3.05e-06, "loss": 0.3411, "step": 4695 }, { "epoch": 426.969696969697, "grad_norm": 0.2601439952850342, "learning_rate": 3.04e-06, "loss": 0.3427, "step": 4696 }, { "epoch": 427.0, "grad_norm": 0.5130738019943237, "learning_rate": 3.0300000000000002e-06, "loss": 0.2719, "step": 4697 }, { "epoch": 427.0969696969697, "grad_norm": 0.26737532019615173, "learning_rate": 3.0200000000000003e-06, "loss": 0.3273, "step": 4698 }, { "epoch": 427.1939393939394, "grad_norm": 0.2933454215526581, "learning_rate": 3.01e-06, "loss": 0.3311, "step": 4699 }, { "epoch": 427.2909090909091, "grad_norm": 0.30595117807388306, "learning_rate": 3e-06, "loss": 0.3515, "step": 4700 }, { "epoch": 427.2909090909091, "eval_loss": 0.4236823320388794, "eval_runtime": 2.1384, "eval_samples_per_second": 25.721, "eval_steps_per_second": 3.274, "step": 4700 }, { "epoch": 427.3878787878788, "grad_norm": 0.31341302394866943, "learning_rate": 2.99e-06, "loss": 0.3082, "step": 4701 }, { "epoch": 427.4848484848485, "grad_norm": 0.28494933247566223, "learning_rate": 2.9800000000000003e-06, "loss": 0.3256, "step": 4702 }, { "epoch": 427.58181818181816, "grad_norm": 0.2777785062789917, "learning_rate": 2.9700000000000004e-06, "loss": 0.3099, "step": 4703 }, { "epoch": 427.6787878787879, "grad_norm": 0.25094926357269287, "learning_rate": 2.9600000000000005e-06, "loss": 0.3432, "step": 4704 }, { "epoch": 427.77575757575755, "grad_norm": 0.29807037115097046, "learning_rate": 2.95e-06, "loss": 0.315, "step": 4705 }, { "epoch": 427.8727272727273, "grad_norm": 0.3128386437892914, "learning_rate": 2.9400000000000002e-06, "loss": 0.3334, "step": 4706 }, { "epoch": 427.969696969697, "grad_norm": 0.3074195981025696, "learning_rate": 2.93e-06, "loss": 0.3195, "step": 4707 }, { "epoch": 428.0, "grad_norm": 0.4899100959300995, "learning_rate": 2.92e-06, "loss": 0.3367, "step": 4708 }, { "epoch": 428.0969696969697, "grad_norm": 0.2630816400051117, "learning_rate": 2.91e-06, "loss": 0.3628, "step": 4709 }, { "epoch": 428.1939393939394, "grad_norm": 0.3018128573894501, "learning_rate": 2.9e-06, "loss": 0.3246, "step": 4710 }, { "epoch": 428.1939393939394, "eval_loss": 0.42355218529701233, "eval_runtime": 2.1113, "eval_samples_per_second": 26.05, "eval_steps_per_second": 3.315, "step": 4710 }, { "epoch": 428.2909090909091, "grad_norm": 0.25907474756240845, "learning_rate": 2.89e-06, "loss": 0.3417, "step": 4711 }, { "epoch": 428.3878787878788, "grad_norm": 0.31002143025398254, "learning_rate": 2.88e-06, "loss": 0.3136, "step": 4712 }, { "epoch": 428.4848484848485, "grad_norm": 0.24571257829666138, "learning_rate": 2.87e-06, "loss": 0.3607, "step": 4713 }, { "epoch": 428.58181818181816, "grad_norm": 0.2842963933944702, "learning_rate": 2.86e-06, "loss": 0.296, "step": 4714 }, { "epoch": 428.6787878787879, "grad_norm": 0.36192312836647034, "learning_rate": 2.8500000000000002e-06, "loss": 0.3295, "step": 4715 }, { "epoch": 428.77575757575755, "grad_norm": 0.2653605043888092, "learning_rate": 2.8400000000000003e-06, "loss": 0.3109, "step": 4716 }, { "epoch": 428.8727272727273, "grad_norm": 0.29872456192970276, "learning_rate": 2.83e-06, "loss": 0.319, "step": 4717 }, { "epoch": 428.969696969697, "grad_norm": 0.2873593866825104, "learning_rate": 2.82e-06, "loss": 0.3029, "step": 4718 }, { "epoch": 429.0, "grad_norm": 0.5015831589698792, "learning_rate": 2.81e-06, "loss": 0.3301, "step": 4719 }, { "epoch": 429.0969696969697, "grad_norm": 0.26784220337867737, "learning_rate": 2.8000000000000003e-06, "loss": 0.3132, "step": 4720 }, { "epoch": 429.0969696969697, "eval_loss": 0.42372432351112366, "eval_runtime": 2.1006, "eval_samples_per_second": 26.183, "eval_steps_per_second": 3.332, "step": 4720 }, { "epoch": 429.1939393939394, "grad_norm": 0.3056507706642151, "learning_rate": 2.7900000000000004e-06, "loss": 0.3011, "step": 4721 }, { "epoch": 429.2909090909091, "grad_norm": 0.30956873297691345, "learning_rate": 2.78e-06, "loss": 0.3318, "step": 4722 }, { "epoch": 429.3878787878788, "grad_norm": 0.28301090002059937, "learning_rate": 2.77e-06, "loss": 0.3312, "step": 4723 }, { "epoch": 429.4848484848485, "grad_norm": 0.30617964267730713, "learning_rate": 2.7600000000000003e-06, "loss": 0.3547, "step": 4724 }, { "epoch": 429.58181818181816, "grad_norm": 0.31053993105888367, "learning_rate": 2.7500000000000004e-06, "loss": 0.3213, "step": 4725 }, { "epoch": 429.6787878787879, "grad_norm": 0.269156277179718, "learning_rate": 2.74e-06, "loss": 0.3257, "step": 4726 }, { "epoch": 429.77575757575755, "grad_norm": 0.25708848237991333, "learning_rate": 2.73e-06, "loss": 0.3304, "step": 4727 }, { "epoch": 429.8727272727273, "grad_norm": 0.2988244891166687, "learning_rate": 2.72e-06, "loss": 0.3393, "step": 4728 }, { "epoch": 429.969696969697, "grad_norm": 0.2603486180305481, "learning_rate": 2.71e-06, "loss": 0.3167, "step": 4729 }, { "epoch": 430.0, "grad_norm": 0.4041363000869751, "learning_rate": 2.7e-06, "loss": 0.3297, "step": 4730 }, { "epoch": 430.0, "eval_loss": 0.42374521493911743, "eval_runtime": 2.1174, "eval_samples_per_second": 25.976, "eval_steps_per_second": 3.306, "step": 4730 }, { "epoch": 430.0969696969697, "grad_norm": 0.27298206090927124, "learning_rate": 2.69e-06, "loss": 0.3604, "step": 4731 }, { "epoch": 430.1939393939394, "grad_norm": 0.2766759693622589, "learning_rate": 2.68e-06, "loss": 0.3354, "step": 4732 }, { "epoch": 430.2909090909091, "grad_norm": 0.26006969809532166, "learning_rate": 2.6700000000000003e-06, "loss": 0.3173, "step": 4733 }, { "epoch": 430.3878787878788, "grad_norm": 0.2893983721733093, "learning_rate": 2.66e-06, "loss": 0.3294, "step": 4734 }, { "epoch": 430.4848484848485, "grad_norm": 0.2810271680355072, "learning_rate": 2.65e-06, "loss": 0.3395, "step": 4735 }, { "epoch": 430.58181818181816, "grad_norm": 0.29783904552459717, "learning_rate": 2.64e-06, "loss": 0.3079, "step": 4736 }, { "epoch": 430.6787878787879, "grad_norm": 0.2597369849681854, "learning_rate": 2.6300000000000002e-06, "loss": 0.2866, "step": 4737 }, { "epoch": 430.77575757575755, "grad_norm": 0.27778446674346924, "learning_rate": 2.6200000000000003e-06, "loss": 0.3203, "step": 4738 }, { "epoch": 430.8727272727273, "grad_norm": 0.34003597497940063, "learning_rate": 2.6100000000000004e-06, "loss": 0.3483, "step": 4739 }, { "epoch": 430.969696969697, "grad_norm": 0.2701445519924164, "learning_rate": 2.6e-06, "loss": 0.3032, "step": 4740 }, { "epoch": 430.969696969697, "eval_loss": 0.4238017797470093, "eval_runtime": 2.1068, "eval_samples_per_second": 26.106, "eval_steps_per_second": 3.323, "step": 4740 }, { "epoch": 431.0, "grad_norm": 0.5419566035270691, "learning_rate": 2.59e-06, "loss": 0.3887, "step": 4741 }, { "epoch": 431.0969696969697, "grad_norm": 0.31240081787109375, "learning_rate": 2.5800000000000003e-06, "loss": 0.3338, "step": 4742 }, { "epoch": 431.1939393939394, "grad_norm": 0.28262385725975037, "learning_rate": 2.5700000000000004e-06, "loss": 0.3193, "step": 4743 }, { "epoch": 431.2909090909091, "grad_norm": 0.28926101326942444, "learning_rate": 2.56e-06, "loss": 0.3167, "step": 4744 }, { "epoch": 431.3878787878788, "grad_norm": 0.2820095419883728, "learning_rate": 2.55e-06, "loss": 0.3244, "step": 4745 }, { "epoch": 431.4848484848485, "grad_norm": 0.26937371492385864, "learning_rate": 2.54e-06, "loss": 0.3157, "step": 4746 }, { "epoch": 431.58181818181816, "grad_norm": 0.28882157802581787, "learning_rate": 2.53e-06, "loss": 0.3016, "step": 4747 }, { "epoch": 431.6787878787879, "grad_norm": 0.2726987600326538, "learning_rate": 2.52e-06, "loss": 0.3514, "step": 4748 }, { "epoch": 431.77575757575755, "grad_norm": 0.28297871351242065, "learning_rate": 2.51e-06, "loss": 0.335, "step": 4749 }, { "epoch": 431.8727272727273, "grad_norm": 0.2559443712234497, "learning_rate": 2.5e-06, "loss": 0.3212, "step": 4750 }, { "epoch": 431.8727272727273, "eval_loss": 0.4235769510269165, "eval_runtime": 2.108, "eval_samples_per_second": 26.091, "eval_steps_per_second": 3.321, "step": 4750 }, { "epoch": 431.969696969697, "grad_norm": 0.3145511746406555, "learning_rate": 2.49e-06, "loss": 0.3164, "step": 4751 }, { "epoch": 432.0, "grad_norm": 0.5010744333267212, "learning_rate": 2.48e-06, "loss": 0.4194, "step": 4752 }, { "epoch": 432.0969696969697, "grad_norm": 0.30744919180870056, "learning_rate": 2.47e-06, "loss": 0.3192, "step": 4753 }, { "epoch": 432.1939393939394, "grad_norm": 0.2515227794647217, "learning_rate": 2.46e-06, "loss": 0.3254, "step": 4754 }, { "epoch": 432.2909090909091, "grad_norm": 0.3341293931007385, "learning_rate": 2.4500000000000003e-06, "loss": 0.3559, "step": 4755 }, { "epoch": 432.3878787878788, "grad_norm": 0.2512374222278595, "learning_rate": 2.4400000000000004e-06, "loss": 0.3434, "step": 4756 }, { "epoch": 432.4848484848485, "grad_norm": 0.2560592591762543, "learning_rate": 2.43e-06, "loss": 0.3145, "step": 4757 }, { "epoch": 432.58181818181816, "grad_norm": 0.27512484788894653, "learning_rate": 2.42e-06, "loss": 0.3072, "step": 4758 }, { "epoch": 432.6787878787879, "grad_norm": 0.3230624198913574, "learning_rate": 2.4100000000000002e-06, "loss": 0.3542, "step": 4759 }, { "epoch": 432.77575757575755, "grad_norm": 0.3115209639072418, "learning_rate": 2.4000000000000003e-06, "loss": 0.323, "step": 4760 }, { "epoch": 432.77575757575755, "eval_loss": 0.42355164885520935, "eval_runtime": 2.1163, "eval_samples_per_second": 25.989, "eval_steps_per_second": 3.308, "step": 4760 }, { "epoch": 432.8727272727273, "grad_norm": 0.26935118436813354, "learning_rate": 2.3900000000000004e-06, "loss": 0.2812, "step": 4761 }, { "epoch": 432.969696969697, "grad_norm": 0.2984224557876587, "learning_rate": 2.38e-06, "loss": 0.3334, "step": 4762 }, { "epoch": 433.0, "grad_norm": 0.44937461614608765, "learning_rate": 2.37e-06, "loss": 0.3498, "step": 4763 }, { "epoch": 433.0969696969697, "grad_norm": 0.29262152314186096, "learning_rate": 2.36e-06, "loss": 0.3392, "step": 4764 }, { "epoch": 433.1939393939394, "grad_norm": 0.30966514348983765, "learning_rate": 2.35e-06, "loss": 0.3217, "step": 4765 }, { "epoch": 433.2909090909091, "grad_norm": 0.2939615845680237, "learning_rate": 2.34e-06, "loss": 0.3118, "step": 4766 }, { "epoch": 433.3878787878788, "grad_norm": 0.2691563367843628, "learning_rate": 2.33e-06, "loss": 0.3595, "step": 4767 }, { "epoch": 433.4848484848485, "grad_norm": 0.30199316143989563, "learning_rate": 2.32e-06, "loss": 0.3523, "step": 4768 }, { "epoch": 433.58181818181816, "grad_norm": 0.2727270722389221, "learning_rate": 2.31e-06, "loss": 0.3091, "step": 4769 }, { "epoch": 433.6787878787879, "grad_norm": 0.28975585103034973, "learning_rate": 2.3e-06, "loss": 0.3097, "step": 4770 }, { "epoch": 433.6787878787879, "eval_loss": 0.42369014024734497, "eval_runtime": 2.1122, "eval_samples_per_second": 26.039, "eval_steps_per_second": 3.314, "step": 4770 }, { "epoch": 433.77575757575755, "grad_norm": 0.2741400897502899, "learning_rate": 2.29e-06, "loss": 0.3175, "step": 4771 }, { "epoch": 433.8727272727273, "grad_norm": 0.3120625913143158, "learning_rate": 2.28e-06, "loss": 0.3066, "step": 4772 }, { "epoch": 433.969696969697, "grad_norm": 0.3074108362197876, "learning_rate": 2.2700000000000003e-06, "loss": 0.328, "step": 4773 }, { "epoch": 434.0, "grad_norm": 0.5580938458442688, "learning_rate": 2.26e-06, "loss": 0.3553, "step": 4774 }, { "epoch": 434.0969696969697, "grad_norm": 0.2596130073070526, "learning_rate": 2.25e-06, "loss": 0.3386, "step": 4775 }, { "epoch": 434.1939393939394, "grad_norm": 0.31736263632774353, "learning_rate": 2.24e-06, "loss": 0.3018, "step": 4776 }, { "epoch": 434.2909090909091, "grad_norm": 0.2828652560710907, "learning_rate": 2.2300000000000002e-06, "loss": 0.3408, "step": 4777 }, { "epoch": 434.3878787878788, "grad_norm": 0.2554495334625244, "learning_rate": 2.2200000000000003e-06, "loss": 0.3246, "step": 4778 }, { "epoch": 434.4848484848485, "grad_norm": 0.27445319294929504, "learning_rate": 2.2100000000000004e-06, "loss": 0.3136, "step": 4779 }, { "epoch": 434.58181818181816, "grad_norm": 0.3118382692337036, "learning_rate": 2.2e-06, "loss": 0.3171, "step": 4780 }, { "epoch": 434.58181818181816, "eval_loss": 0.42379724979400635, "eval_runtime": 2.1206, "eval_samples_per_second": 25.937, "eval_steps_per_second": 3.301, "step": 4780 }, { "epoch": 434.6787878787879, "grad_norm": 0.29958993196487427, "learning_rate": 2.19e-06, "loss": 0.3275, "step": 4781 }, { "epoch": 434.77575757575755, "grad_norm": 0.24797827005386353, "learning_rate": 2.1800000000000003e-06, "loss": 0.303, "step": 4782 }, { "epoch": 434.8727272727273, "grad_norm": 0.2977842688560486, "learning_rate": 2.17e-06, "loss": 0.3703, "step": 4783 }, { "epoch": 434.969696969697, "grad_norm": 0.26860713958740234, "learning_rate": 2.16e-06, "loss": 0.335, "step": 4784 }, { "epoch": 435.0, "grad_norm": 0.47584444284439087, "learning_rate": 2.1499999999999997e-06, "loss": 0.2958, "step": 4785 }, { "epoch": 435.0969696969697, "grad_norm": 0.2772142291069031, "learning_rate": 2.14e-06, "loss": 0.3721, "step": 4786 }, { "epoch": 435.1939393939394, "grad_norm": 0.294428288936615, "learning_rate": 2.13e-06, "loss": 0.3096, "step": 4787 }, { "epoch": 435.2909090909091, "grad_norm": 0.3144077956676483, "learning_rate": 2.12e-06, "loss": 0.3143, "step": 4788 }, { "epoch": 435.3878787878788, "grad_norm": 0.2641150951385498, "learning_rate": 2.11e-06, "loss": 0.2965, "step": 4789 }, { "epoch": 435.4848484848485, "grad_norm": 0.2828153371810913, "learning_rate": 2.1000000000000002e-06, "loss": 0.309, "step": 4790 }, { "epoch": 435.4848484848485, "eval_loss": 0.4236714243888855, "eval_runtime": 2.1076, "eval_samples_per_second": 26.096, "eval_steps_per_second": 3.321, "step": 4790 }, { "epoch": 435.58181818181816, "grad_norm": 0.29111000895500183, "learning_rate": 2.09e-06, "loss": 0.3343, "step": 4791 }, { "epoch": 435.6787878787879, "grad_norm": 0.32535022497177124, "learning_rate": 2.08e-06, "loss": 0.3533, "step": 4792 }, { "epoch": 435.77575757575755, "grad_norm": 0.25760555267333984, "learning_rate": 2.07e-06, "loss": 0.3249, "step": 4793 }, { "epoch": 435.8727272727273, "grad_norm": 0.26453596353530884, "learning_rate": 2.06e-06, "loss": 0.346, "step": 4794 }, { "epoch": 435.969696969697, "grad_norm": 0.29465922713279724, "learning_rate": 2.0500000000000003e-06, "loss": 0.2883, "step": 4795 }, { "epoch": 436.0, "grad_norm": 0.5762606263160706, "learning_rate": 2.0400000000000004e-06, "loss": 0.3735, "step": 4796 }, { "epoch": 436.0969696969697, "grad_norm": 0.2903105914592743, "learning_rate": 2.03e-06, "loss": 0.3105, "step": 4797 }, { "epoch": 436.1939393939394, "grad_norm": 0.2878726124763489, "learning_rate": 2.02e-06, "loss": 0.3584, "step": 4798 }, { "epoch": 436.2909090909091, "grad_norm": 0.2933520972728729, "learning_rate": 2.0100000000000002e-06, "loss": 0.3282, "step": 4799 }, { "epoch": 436.3878787878788, "grad_norm": 0.270224928855896, "learning_rate": 2.0000000000000003e-06, "loss": 0.3184, "step": 4800 }, { "epoch": 436.3878787878788, "eval_loss": 0.423602432012558, "eval_runtime": 2.0973, "eval_samples_per_second": 26.225, "eval_steps_per_second": 3.338, "step": 4800 }, { "epoch": 436.4848484848485, "grad_norm": 0.272318035364151, "learning_rate": 1.99e-06, "loss": 0.3204, "step": 4801 }, { "epoch": 436.58181818181816, "grad_norm": 0.30114665627479553, "learning_rate": 1.98e-06, "loss": 0.3378, "step": 4802 }, { "epoch": 436.6787878787879, "grad_norm": 0.28273192048072815, "learning_rate": 1.9699999999999998e-06, "loss": 0.3617, "step": 4803 }, { "epoch": 436.77575757575755, "grad_norm": 0.31538066267967224, "learning_rate": 1.96e-06, "loss": 0.3142, "step": 4804 }, { "epoch": 436.8727272727273, "grad_norm": 0.37554872035980225, "learning_rate": 1.95e-06, "loss": 0.3327, "step": 4805 }, { "epoch": 436.969696969697, "grad_norm": 0.30792397260665894, "learning_rate": 1.94e-06, "loss": 0.3026, "step": 4806 }, { "epoch": 437.0, "grad_norm": 0.5518594980239868, "learning_rate": 1.93e-06, "loss": 0.2628, "step": 4807 }, { "epoch": 437.0969696969697, "grad_norm": 0.3231973648071289, "learning_rate": 1.92e-06, "loss": 0.2984, "step": 4808 }, { "epoch": 437.1939393939394, "grad_norm": 0.2513819932937622, "learning_rate": 1.91e-06, "loss": 0.3173, "step": 4809 }, { "epoch": 437.2909090909091, "grad_norm": 0.275855153799057, "learning_rate": 1.9e-06, "loss": 0.3195, "step": 4810 }, { "epoch": 437.2909090909091, "eval_loss": 0.42367324233055115, "eval_runtime": 2.111, "eval_samples_per_second": 26.054, "eval_steps_per_second": 3.316, "step": 4810 }, { "epoch": 437.3878787878788, "grad_norm": 0.25522032380104065, "learning_rate": 1.8900000000000001e-06, "loss": 0.3273, "step": 4811 }, { "epoch": 437.4848484848485, "grad_norm": 0.29282745718955994, "learning_rate": 1.8800000000000002e-06, "loss": 0.3043, "step": 4812 }, { "epoch": 437.58181818181816, "grad_norm": 0.28065821528434753, "learning_rate": 1.8700000000000003e-06, "loss": 0.3699, "step": 4813 }, { "epoch": 437.6787878787879, "grad_norm": 0.27441632747650146, "learning_rate": 1.86e-06, "loss": 0.328, "step": 4814 }, { "epoch": 437.77575757575755, "grad_norm": 0.3121632933616638, "learning_rate": 1.85e-06, "loss": 0.3467, "step": 4815 }, { "epoch": 437.8727272727273, "grad_norm": 0.3079357147216797, "learning_rate": 1.84e-06, "loss": 0.3555, "step": 4816 }, { "epoch": 437.969696969697, "grad_norm": 0.2577809989452362, "learning_rate": 1.83e-06, "loss": 0.3061, "step": 4817 }, { "epoch": 438.0, "grad_norm": 0.45075249671936035, "learning_rate": 1.8200000000000002e-06, "loss": 0.2846, "step": 4818 }, { "epoch": 438.0969696969697, "grad_norm": 0.37741875648498535, "learning_rate": 1.8100000000000002e-06, "loss": 0.3472, "step": 4819 }, { "epoch": 438.1939393939394, "grad_norm": 0.3882526755332947, "learning_rate": 1.8e-06, "loss": 0.3513, "step": 4820 }, { "epoch": 438.1939393939394, "eval_loss": 0.4238298237323761, "eval_runtime": 2.1023, "eval_samples_per_second": 26.162, "eval_steps_per_second": 3.33, "step": 4820 }, { "epoch": 438.2909090909091, "grad_norm": 0.318301260471344, "learning_rate": 1.79e-06, "loss": 0.3422, "step": 4821 }, { "epoch": 438.3878787878788, "grad_norm": 0.2677038013935089, "learning_rate": 1.7800000000000001e-06, "loss": 0.3271, "step": 4822 }, { "epoch": 438.4848484848485, "grad_norm": 0.2726380228996277, "learning_rate": 1.7700000000000002e-06, "loss": 0.3278, "step": 4823 }, { "epoch": 438.58181818181816, "grad_norm": 0.2821105420589447, "learning_rate": 1.76e-06, "loss": 0.2994, "step": 4824 }, { "epoch": 438.6787878787879, "grad_norm": 0.3339035212993622, "learning_rate": 1.7500000000000002e-06, "loss": 0.2954, "step": 4825 }, { "epoch": 438.77575757575755, "grad_norm": 0.2941056489944458, "learning_rate": 1.7399999999999999e-06, "loss": 0.3239, "step": 4826 }, { "epoch": 438.8727272727273, "grad_norm": 0.2761934995651245, "learning_rate": 1.73e-06, "loss": 0.3361, "step": 4827 }, { "epoch": 438.969696969697, "grad_norm": 0.26881179213523865, "learning_rate": 1.72e-06, "loss": 0.321, "step": 4828 }, { "epoch": 439.0, "grad_norm": 0.5755116939544678, "learning_rate": 1.7100000000000001e-06, "loss": 0.3036, "step": 4829 }, { "epoch": 439.0969696969697, "grad_norm": 0.29740414023399353, "learning_rate": 1.7000000000000002e-06, "loss": 0.307, "step": 4830 }, { "epoch": 439.0969696969697, "eval_loss": 0.4238336682319641, "eval_runtime": 2.1193, "eval_samples_per_second": 25.952, "eval_steps_per_second": 3.303, "step": 4830 }, { "epoch": 439.1939393939394, "grad_norm": 0.346254825592041, "learning_rate": 1.69e-06, "loss": 0.3137, "step": 4831 }, { "epoch": 439.2909090909091, "grad_norm": 0.2799615263938904, "learning_rate": 1.68e-06, "loss": 0.3146, "step": 4832 }, { "epoch": 439.3878787878788, "grad_norm": 0.269903302192688, "learning_rate": 1.67e-06, "loss": 0.3039, "step": 4833 }, { "epoch": 439.4848484848485, "grad_norm": 0.2790166735649109, "learning_rate": 1.6600000000000002e-06, "loss": 0.3179, "step": 4834 }, { "epoch": 439.58181818181816, "grad_norm": 0.3139094114303589, "learning_rate": 1.65e-06, "loss": 0.3358, "step": 4835 }, { "epoch": 439.6787878787879, "grad_norm": 0.29021573066711426, "learning_rate": 1.6400000000000002e-06, "loss": 0.3409, "step": 4836 }, { "epoch": 439.77575757575755, "grad_norm": 0.3001808822154999, "learning_rate": 1.6299999999999999e-06, "loss": 0.3347, "step": 4837 }, { "epoch": 439.8727272727273, "grad_norm": 0.2734721601009369, "learning_rate": 1.62e-06, "loss": 0.3314, "step": 4838 }, { "epoch": 439.969696969697, "grad_norm": 0.27748993039131165, "learning_rate": 1.61e-06, "loss": 0.3715, "step": 4839 }, { "epoch": 440.0, "grad_norm": 0.4819665849208832, "learning_rate": 1.6000000000000001e-06, "loss": 0.3023, "step": 4840 }, { "epoch": 440.0, "eval_loss": 0.42376306653022766, "eval_runtime": 2.109, "eval_samples_per_second": 26.079, "eval_steps_per_second": 3.319, "step": 4840 }, { "epoch": 440.0969696969697, "grad_norm": 0.31397852301597595, "learning_rate": 1.5900000000000002e-06, "loss": 0.3496, "step": 4841 }, { "epoch": 440.1939393939394, "grad_norm": 0.3182499408721924, "learning_rate": 1.5800000000000003e-06, "loss": 0.3132, "step": 4842 }, { "epoch": 440.2909090909091, "grad_norm": 0.29344242811203003, "learning_rate": 1.57e-06, "loss": 0.2864, "step": 4843 }, { "epoch": 440.3878787878788, "grad_norm": 0.27054810523986816, "learning_rate": 1.56e-06, "loss": 0.338, "step": 4844 }, { "epoch": 440.4848484848485, "grad_norm": 0.2715929448604584, "learning_rate": 1.55e-06, "loss": 0.3441, "step": 4845 }, { "epoch": 440.58181818181816, "grad_norm": 0.27300938963890076, "learning_rate": 1.54e-06, "loss": 0.3072, "step": 4846 }, { "epoch": 440.6787878787879, "grad_norm": 0.2786642611026764, "learning_rate": 1.53e-06, "loss": 0.313, "step": 4847 }, { "epoch": 440.77575757575755, "grad_norm": 0.269209086894989, "learning_rate": 1.52e-06, "loss": 0.3338, "step": 4848 }, { "epoch": 440.8727272727273, "grad_norm": 0.26223093271255493, "learning_rate": 1.5100000000000002e-06, "loss": 0.3319, "step": 4849 }, { "epoch": 440.969696969697, "grad_norm": 0.28911757469177246, "learning_rate": 1.5e-06, "loss": 0.3457, "step": 4850 }, { "epoch": 440.969696969697, "eval_loss": 0.4236938953399658, "eval_runtime": 2.1201, "eval_samples_per_second": 25.943, "eval_steps_per_second": 3.302, "step": 4850 }, { "epoch": 441.0, "grad_norm": 0.5196654796600342, "learning_rate": 1.4900000000000001e-06, "loss": 0.3324, "step": 4851 }, { "epoch": 441.0969696969697, "grad_norm": 0.2900083363056183, "learning_rate": 1.4800000000000002e-06, "loss": 0.3761, "step": 4852 }, { "epoch": 441.1939393939394, "grad_norm": 0.2578149735927582, "learning_rate": 1.4700000000000001e-06, "loss": 0.3005, "step": 4853 }, { "epoch": 441.2909090909091, "grad_norm": 0.27350473403930664, "learning_rate": 1.46e-06, "loss": 0.3364, "step": 4854 }, { "epoch": 441.3878787878788, "grad_norm": 0.28097885847091675, "learning_rate": 1.45e-06, "loss": 0.3353, "step": 4855 }, { "epoch": 441.4848484848485, "grad_norm": 0.3071709871292114, "learning_rate": 1.44e-06, "loss": 0.3351, "step": 4856 }, { "epoch": 441.58181818181816, "grad_norm": 0.382062703371048, "learning_rate": 1.43e-06, "loss": 0.3114, "step": 4857 }, { "epoch": 441.6787878787879, "grad_norm": 0.27680331468582153, "learning_rate": 1.4200000000000002e-06, "loss": 0.3436, "step": 4858 }, { "epoch": 441.77575757575755, "grad_norm": 0.2957818806171417, "learning_rate": 1.41e-06, "loss": 0.3129, "step": 4859 }, { "epoch": 441.8727272727273, "grad_norm": 0.2488706111907959, "learning_rate": 1.4000000000000001e-06, "loss": 0.3263, "step": 4860 }, { "epoch": 441.8727272727273, "eval_loss": 0.4236072301864624, "eval_runtime": 2.1291, "eval_samples_per_second": 25.832, "eval_steps_per_second": 3.288, "step": 4860 }, { "epoch": 441.969696969697, "grad_norm": 0.29431694746017456, "learning_rate": 1.39e-06, "loss": 0.2953, "step": 4861 }, { "epoch": 442.0, "grad_norm": 0.5485882759094238, "learning_rate": 1.3800000000000001e-06, "loss": 0.2916, "step": 4862 }, { "epoch": 442.0969696969697, "grad_norm": 0.2963046729564667, "learning_rate": 1.37e-06, "loss": 0.301, "step": 4863 }, { "epoch": 442.1939393939394, "grad_norm": 0.27449437975883484, "learning_rate": 1.36e-06, "loss": 0.2883, "step": 4864 }, { "epoch": 442.2909090909091, "grad_norm": 0.25899559259414673, "learning_rate": 1.35e-06, "loss": 0.3357, "step": 4865 }, { "epoch": 442.3878787878788, "grad_norm": 0.29228493571281433, "learning_rate": 1.34e-06, "loss": 0.3331, "step": 4866 }, { "epoch": 442.4848484848485, "grad_norm": 0.2560674250125885, "learning_rate": 1.33e-06, "loss": 0.3252, "step": 4867 }, { "epoch": 442.58181818181816, "grad_norm": 0.2929626405239105, "learning_rate": 1.32e-06, "loss": 0.3488, "step": 4868 }, { "epoch": 442.6787878787879, "grad_norm": 0.26246029138565063, "learning_rate": 1.3100000000000002e-06, "loss": 0.3306, "step": 4869 }, { "epoch": 442.77575757575755, "grad_norm": 0.29994064569473267, "learning_rate": 1.3e-06, "loss": 0.3437, "step": 4870 }, { "epoch": 442.77575757575755, "eval_loss": 0.42368239164352417, "eval_runtime": 2.1083, "eval_samples_per_second": 26.087, "eval_steps_per_second": 3.32, "step": 4870 }, { "epoch": 442.8727272727273, "grad_norm": 0.336182564496994, "learning_rate": 1.2900000000000001e-06, "loss": 0.3325, "step": 4871 }, { "epoch": 442.969696969697, "grad_norm": 0.2670702636241913, "learning_rate": 1.28e-06, "loss": 0.3243, "step": 4872 }, { "epoch": 443.0, "grad_norm": 0.4987045228481293, "learning_rate": 1.27e-06, "loss": 0.3315, "step": 4873 }, { "epoch": 443.0969696969697, "grad_norm": 0.45853880047798157, "learning_rate": 1.26e-06, "loss": 0.2985, "step": 4874 }, { "epoch": 443.1939393939394, "grad_norm": 0.2865673899650574, "learning_rate": 1.25e-06, "loss": 0.3484, "step": 4875 }, { "epoch": 443.2909090909091, "grad_norm": 0.2569712698459625, "learning_rate": 1.24e-06, "loss": 0.3291, "step": 4876 }, { "epoch": 443.3878787878788, "grad_norm": 0.24614472687244415, "learning_rate": 1.23e-06, "loss": 0.336, "step": 4877 }, { "epoch": 443.4848484848485, "grad_norm": 0.2899981439113617, "learning_rate": 1.2200000000000002e-06, "loss": 0.3269, "step": 4878 }, { "epoch": 443.58181818181816, "grad_norm": 0.28798767924308777, "learning_rate": 1.21e-06, "loss": 0.351, "step": 4879 }, { "epoch": 443.6787878787879, "grad_norm": 0.2734663784503937, "learning_rate": 1.2000000000000002e-06, "loss": 0.3152, "step": 4880 }, { "epoch": 443.6787878787879, "eval_loss": 0.423577219247818, "eval_runtime": 2.0931, "eval_samples_per_second": 26.277, "eval_steps_per_second": 3.344, "step": 4880 }, { "epoch": 443.77575757575755, "grad_norm": 0.26212504506111145, "learning_rate": 1.19e-06, "loss": 0.2981, "step": 4881 }, { "epoch": 443.8727272727273, "grad_norm": 0.2801170349121094, "learning_rate": 1.18e-06, "loss": 0.3277, "step": 4882 }, { "epoch": 443.969696969697, "grad_norm": 0.2888226807117462, "learning_rate": 1.17e-06, "loss": 0.3287, "step": 4883 }, { "epoch": 444.0, "grad_norm": 0.6596548557281494, "learning_rate": 1.16e-06, "loss": 0.3443, "step": 4884 }, { "epoch": 444.0969696969697, "grad_norm": 0.2544076442718506, "learning_rate": 1.15e-06, "loss": 0.3149, "step": 4885 }, { "epoch": 444.1939393939394, "grad_norm": 0.3435005247592926, "learning_rate": 1.14e-06, "loss": 0.329, "step": 4886 }, { "epoch": 444.2909090909091, "grad_norm": 0.31201043725013733, "learning_rate": 1.13e-06, "loss": 0.3529, "step": 4887 }, { "epoch": 444.3878787878788, "grad_norm": 0.27917465567588806, "learning_rate": 1.12e-06, "loss": 0.3457, "step": 4888 }, { "epoch": 444.4848484848485, "grad_norm": 0.3057820498943329, "learning_rate": 1.1100000000000002e-06, "loss": 0.3285, "step": 4889 }, { "epoch": 444.58181818181816, "grad_norm": 0.26686015725135803, "learning_rate": 1.1e-06, "loss": 0.3251, "step": 4890 }, { "epoch": 444.58181818181816, "eval_loss": 0.4236905872821808, "eval_runtime": 2.0952, "eval_samples_per_second": 26.251, "eval_steps_per_second": 3.341, "step": 4890 }, { "epoch": 444.6787878787879, "grad_norm": 0.28641071915626526, "learning_rate": 1.0900000000000002e-06, "loss": 0.3135, "step": 4891 }, { "epoch": 444.77575757575755, "grad_norm": 0.2760150730609894, "learning_rate": 1.08e-06, "loss": 0.3069, "step": 4892 }, { "epoch": 444.8727272727273, "grad_norm": 0.266857773065567, "learning_rate": 1.07e-06, "loss": 0.346, "step": 4893 }, { "epoch": 444.969696969697, "grad_norm": 0.32249146699905396, "learning_rate": 1.06e-06, "loss": 0.317, "step": 4894 }, { "epoch": 445.0, "grad_norm": 0.41339409351348877, "learning_rate": 1.0500000000000001e-06, "loss": 0.2677, "step": 4895 }, { "epoch": 445.0969696969697, "grad_norm": 0.27063751220703125, "learning_rate": 1.04e-06, "loss": 0.3249, "step": 4896 }, { "epoch": 445.1939393939394, "grad_norm": 0.2708544135093689, "learning_rate": 1.03e-06, "loss": 0.3461, "step": 4897 }, { "epoch": 445.2909090909091, "grad_norm": 0.3347277343273163, "learning_rate": 1.0200000000000002e-06, "loss": 0.3332, "step": 4898 }, { "epoch": 445.3878787878788, "grad_norm": 0.282702773809433, "learning_rate": 1.01e-06, "loss": 0.3284, "step": 4899 }, { "epoch": 445.4848484848485, "grad_norm": 0.30617809295654297, "learning_rate": 1.0000000000000002e-06, "loss": 0.3039, "step": 4900 }, { "epoch": 445.4848484848485, "eval_loss": 0.4235691726207733, "eval_runtime": 2.1044, "eval_samples_per_second": 26.136, "eval_steps_per_second": 3.326, "step": 4900 }, { "epoch": 445.58181818181816, "grad_norm": 0.3247913122177124, "learning_rate": 9.9e-07, "loss": 0.3181, "step": 4901 }, { "epoch": 445.6787878787879, "grad_norm": 0.30320414900779724, "learning_rate": 9.8e-07, "loss": 0.3312, "step": 4902 }, { "epoch": 445.77575757575755, "grad_norm": 0.27612176537513733, "learning_rate": 9.7e-07, "loss": 0.3375, "step": 4903 }, { "epoch": 445.8727272727273, "grad_norm": 0.31707799434661865, "learning_rate": 9.6e-07, "loss": 0.3347, "step": 4904 }, { "epoch": 445.969696969697, "grad_norm": 0.28800132870674133, "learning_rate": 9.5e-07, "loss": 0.3175, "step": 4905 }, { "epoch": 446.0, "grad_norm": 0.6043686866760254, "learning_rate": 9.400000000000001e-07, "loss": 0.2893, "step": 4906 }, { "epoch": 446.0969696969697, "grad_norm": 0.2963293194770813, "learning_rate": 9.3e-07, "loss": 0.317, "step": 4907 }, { "epoch": 446.1939393939394, "grad_norm": 0.30245649814605713, "learning_rate": 9.2e-07, "loss": 0.3513, "step": 4908 }, { "epoch": 446.2909090909091, "grad_norm": 0.25606197118759155, "learning_rate": 9.100000000000001e-07, "loss": 0.3136, "step": 4909 }, { "epoch": 446.3878787878788, "grad_norm": 0.3276771903038025, "learning_rate": 9e-07, "loss": 0.3119, "step": 4910 }, { "epoch": 446.3878787878788, "eval_loss": 0.42358332872390747, "eval_runtime": 2.1075, "eval_samples_per_second": 26.098, "eval_steps_per_second": 3.322, "step": 4910 }, { "epoch": 446.4848484848485, "grad_norm": 0.28884825110435486, "learning_rate": 8.900000000000001e-07, "loss": 0.3349, "step": 4911 }, { "epoch": 446.58181818181816, "grad_norm": 0.29931309819221497, "learning_rate": 8.8e-07, "loss": 0.3571, "step": 4912 }, { "epoch": 446.6787878787879, "grad_norm": 0.30855095386505127, "learning_rate": 8.699999999999999e-07, "loss": 0.2973, "step": 4913 }, { "epoch": 446.77575757575755, "grad_norm": 0.3217792510986328, "learning_rate": 8.6e-07, "loss": 0.3319, "step": 4914 }, { "epoch": 446.8727272727273, "grad_norm": 0.2590404748916626, "learning_rate": 8.500000000000001e-07, "loss": 0.3124, "step": 4915 }, { "epoch": 446.969696969697, "grad_norm": 0.30771511793136597, "learning_rate": 8.4e-07, "loss": 0.3506, "step": 4916 }, { "epoch": 447.0, "grad_norm": 0.5211871862411499, "learning_rate": 8.300000000000001e-07, "loss": 0.2776, "step": 4917 }, { "epoch": 447.0969696969697, "grad_norm": 0.27772092819213867, "learning_rate": 8.200000000000001e-07, "loss": 0.3135, "step": 4918 }, { "epoch": 447.1939393939394, "grad_norm": 0.29392674565315247, "learning_rate": 8.1e-07, "loss": 0.3452, "step": 4919 }, { "epoch": 447.2909090909091, "grad_norm": 0.2605448365211487, "learning_rate": 8.000000000000001e-07, "loss": 0.3271, "step": 4920 }, { "epoch": 447.2909090909091, "eval_loss": 0.4235498607158661, "eval_runtime": 2.1047, "eval_samples_per_second": 26.132, "eval_steps_per_second": 3.326, "step": 4920 }, { "epoch": 447.3878787878788, "grad_norm": 0.29172196984291077, "learning_rate": 7.900000000000002e-07, "loss": 0.3317, "step": 4921 }, { "epoch": 447.4848484848485, "grad_norm": 0.28964757919311523, "learning_rate": 7.8e-07, "loss": 0.3499, "step": 4922 }, { "epoch": 447.58181818181816, "grad_norm": 0.27365830540657043, "learning_rate": 7.7e-07, "loss": 0.3366, "step": 4923 }, { "epoch": 447.6787878787879, "grad_norm": 0.25957080721855164, "learning_rate": 7.6e-07, "loss": 0.3317, "step": 4924 }, { "epoch": 447.77575757575755, "grad_norm": 0.26374414563179016, "learning_rate": 7.5e-07, "loss": 0.3006, "step": 4925 }, { "epoch": 447.8727272727273, "grad_norm": 0.31097298860549927, "learning_rate": 7.400000000000001e-07, "loss": 0.3119, "step": 4926 }, { "epoch": 447.969696969697, "grad_norm": 0.2407606989145279, "learning_rate": 7.3e-07, "loss": 0.3228, "step": 4927 }, { "epoch": 448.0, "grad_norm": 0.6657671332359314, "learning_rate": 7.2e-07, "loss": 0.3023, "step": 4928 }, { "epoch": 448.0969696969697, "grad_norm": 0.3061230778694153, "learning_rate": 7.100000000000001e-07, "loss": 0.3364, "step": 4929 }, { "epoch": 448.1939393939394, "grad_norm": 0.31951260566711426, "learning_rate": 7.000000000000001e-07, "loss": 0.3437, "step": 4930 }, { "epoch": 448.1939393939394, "eval_loss": 0.4234934151172638, "eval_runtime": 2.1175, "eval_samples_per_second": 25.975, "eval_steps_per_second": 3.306, "step": 4930 }, { "epoch": 448.2909090909091, "grad_norm": 0.26566335558891296, "learning_rate": 6.900000000000001e-07, "loss": 0.3125, "step": 4931 }, { "epoch": 448.3878787878788, "grad_norm": 0.26490557193756104, "learning_rate": 6.8e-07, "loss": 0.3289, "step": 4932 }, { "epoch": 448.4848484848485, "grad_norm": 0.3251090943813324, "learning_rate": 6.7e-07, "loss": 0.3069, "step": 4933 }, { "epoch": 448.58181818181816, "grad_norm": 0.28914380073547363, "learning_rate": 6.6e-07, "loss": 0.3205, "step": 4934 }, { "epoch": 448.6787878787879, "grad_norm": 0.30359625816345215, "learning_rate": 6.5e-07, "loss": 0.3374, "step": 4935 }, { "epoch": 448.77575757575755, "grad_norm": 0.2604947090148926, "learning_rate": 6.4e-07, "loss": 0.3215, "step": 4936 }, { "epoch": 448.8727272727273, "grad_norm": 0.2874707281589508, "learning_rate": 6.3e-07, "loss": 0.3247, "step": 4937 }, { "epoch": 448.969696969697, "grad_norm": 0.32386428117752075, "learning_rate": 6.2e-07, "loss": 0.3289, "step": 4938 }, { "epoch": 449.0, "grad_norm": 0.5075844526290894, "learning_rate": 6.100000000000001e-07, "loss": 0.3356, "step": 4939 }, { "epoch": 449.0969696969697, "grad_norm": 0.30247360467910767, "learning_rate": 6.000000000000001e-07, "loss": 0.3472, "step": 4940 }, { "epoch": 449.0969696969697, "eval_loss": 0.42349129915237427, "eval_runtime": 2.0915, "eval_samples_per_second": 26.297, "eval_steps_per_second": 3.347, "step": 4940 }, { "epoch": 449.1939393939394, "grad_norm": 0.2983882427215576, "learning_rate": 5.9e-07, "loss": 0.3281, "step": 4941 }, { "epoch": 449.2909090909091, "grad_norm": 0.2906897962093353, "learning_rate": 5.8e-07, "loss": 0.3541, "step": 4942 }, { "epoch": 449.3878787878788, "grad_norm": 0.2708105146884918, "learning_rate": 5.7e-07, "loss": 0.3005, "step": 4943 }, { "epoch": 449.4848484848485, "grad_norm": 0.2866322696208954, "learning_rate": 5.6e-07, "loss": 0.3287, "step": 4944 }, { "epoch": 449.58181818181816, "grad_norm": 0.2833450734615326, "learning_rate": 5.5e-07, "loss": 0.328, "step": 4945 }, { "epoch": 449.6787878787879, "grad_norm": 0.2915904223918915, "learning_rate": 5.4e-07, "loss": 0.3245, "step": 4946 }, { "epoch": 449.77575757575755, "grad_norm": 0.24670271575450897, "learning_rate": 5.3e-07, "loss": 0.33, "step": 4947 }, { "epoch": 449.8727272727273, "grad_norm": 0.31395289301872253, "learning_rate": 5.2e-07, "loss": 0.3198, "step": 4948 }, { "epoch": 449.969696969697, "grad_norm": 0.27524620294570923, "learning_rate": 5.100000000000001e-07, "loss": 0.3062, "step": 4949 }, { "epoch": 450.0, "grad_norm": 0.4141598045825958, "learning_rate": 5.000000000000001e-07, "loss": 0.3101, "step": 4950 }, { "epoch": 450.0, "eval_loss": 0.4236139953136444, "eval_runtime": 2.1008, "eval_samples_per_second": 26.181, "eval_steps_per_second": 3.332, "step": 4950 }, { "epoch": 450.0969696969697, "grad_norm": 0.26350322365760803, "learning_rate": 4.9e-07, "loss": 0.3295, "step": 4951 }, { "epoch": 450.1939393939394, "grad_norm": 0.243812695145607, "learning_rate": 4.8e-07, "loss": 0.2898, "step": 4952 }, { "epoch": 450.2909090909091, "grad_norm": 0.2724280059337616, "learning_rate": 4.7000000000000005e-07, "loss": 0.3327, "step": 4953 }, { "epoch": 450.3878787878788, "grad_norm": 0.27506184577941895, "learning_rate": 4.6e-07, "loss": 0.3174, "step": 4954 }, { "epoch": 450.4848484848485, "grad_norm": 0.2802964448928833, "learning_rate": 4.5e-07, "loss": 0.3159, "step": 4955 }, { "epoch": 450.58181818181816, "grad_norm": 0.29718518257141113, "learning_rate": 4.4e-07, "loss": 0.329, "step": 4956 }, { "epoch": 450.6787878787879, "grad_norm": 0.28943824768066406, "learning_rate": 4.3e-07, "loss": 0.3397, "step": 4957 }, { "epoch": 450.77575757575755, "grad_norm": 0.2653546631336212, "learning_rate": 4.2e-07, "loss": 0.3435, "step": 4958 }, { "epoch": 450.8727272727273, "grad_norm": 0.25897830724716187, "learning_rate": 4.1000000000000004e-07, "loss": 0.3195, "step": 4959 }, { "epoch": 450.969696969697, "grad_norm": 0.3242824077606201, "learning_rate": 4.0000000000000003e-07, "loss": 0.3304, "step": 4960 }, { "epoch": 450.969696969697, "eval_loss": 0.42363691329956055, "eval_runtime": 2.1169, "eval_samples_per_second": 25.981, "eval_steps_per_second": 3.307, "step": 4960 }, { "epoch": 451.0, "grad_norm": 0.5916183590888977, "learning_rate": 3.9e-07, "loss": 0.3718, "step": 4961 }, { "epoch": 451.0969696969697, "grad_norm": 0.3053007423877716, "learning_rate": 3.8e-07, "loss": 0.311, "step": 4962 }, { "epoch": 451.1939393939394, "grad_norm": 0.26286765933036804, "learning_rate": 3.7000000000000006e-07, "loss": 0.3738, "step": 4963 }, { "epoch": 451.2909090909091, "grad_norm": 0.29149967432022095, "learning_rate": 3.6e-07, "loss": 0.3599, "step": 4964 }, { "epoch": 451.3878787878788, "grad_norm": 0.32165083289146423, "learning_rate": 3.5000000000000004e-07, "loss": 0.334, "step": 4965 }, { "epoch": 451.4848484848485, "grad_norm": 0.26468729972839355, "learning_rate": 3.4e-07, "loss": 0.3138, "step": 4966 }, { "epoch": 451.58181818181816, "grad_norm": 0.2730655372142792, "learning_rate": 3.3e-07, "loss": 0.2947, "step": 4967 }, { "epoch": 451.6787878787879, "grad_norm": 0.286457896232605, "learning_rate": 3.2e-07, "loss": 0.2962, "step": 4968 }, { "epoch": 451.77575757575755, "grad_norm": 0.28551334142684937, "learning_rate": 3.1e-07, "loss": 0.3417, "step": 4969 }, { "epoch": 451.8727272727273, "grad_norm": 0.29090040922164917, "learning_rate": 3.0000000000000004e-07, "loss": 0.3109, "step": 4970 }, { "epoch": 451.8727272727273, "eval_loss": 0.4235963225364685, "eval_runtime": 2.1061, "eval_samples_per_second": 26.115, "eval_steps_per_second": 3.324, "step": 4970 }, { "epoch": 451.969696969697, "grad_norm": 0.3137418031692505, "learning_rate": 2.9e-07, "loss": 0.3263, "step": 4971 }, { "epoch": 452.0, "grad_norm": 0.5446314811706543, "learning_rate": 2.8e-07, "loss": 0.3331, "step": 4972 }, { "epoch": 452.0969696969697, "grad_norm": 0.2853860855102539, "learning_rate": 2.7e-07, "loss": 0.3425, "step": 4973 }, { "epoch": 452.1939393939394, "grad_norm": 0.24901294708251953, "learning_rate": 2.6e-07, "loss": 0.33, "step": 4974 }, { "epoch": 452.2909090909091, "grad_norm": 0.2792257070541382, "learning_rate": 2.5000000000000004e-07, "loss": 0.3255, "step": 4975 }, { "epoch": 452.3878787878788, "grad_norm": 0.2828456461429596, "learning_rate": 2.4e-07, "loss": 0.2862, "step": 4976 }, { "epoch": 452.4848484848485, "grad_norm": 0.24984979629516602, "learning_rate": 2.3e-07, "loss": 0.2976, "step": 4977 }, { "epoch": 452.58181818181816, "grad_norm": 0.31003209948539734, "learning_rate": 2.2e-07, "loss": 0.3427, "step": 4978 }, { "epoch": 452.6787878787879, "grad_norm": 0.316670298576355, "learning_rate": 2.1e-07, "loss": 0.3252, "step": 4979 }, { "epoch": 452.77575757575755, "grad_norm": 0.3688763976097107, "learning_rate": 2.0000000000000002e-07, "loss": 0.3448, "step": 4980 }, { "epoch": 452.77575757575755, "eval_loss": 0.4235762655735016, "eval_runtime": 2.1016, "eval_samples_per_second": 26.17, "eval_steps_per_second": 3.331, "step": 4980 }, { "epoch": 452.8727272727273, "grad_norm": 0.29659217596054077, "learning_rate": 1.9e-07, "loss": 0.3238, "step": 4981 }, { "epoch": 452.969696969697, "grad_norm": 0.2912966310977936, "learning_rate": 1.8e-07, "loss": 0.3385, "step": 4982 }, { "epoch": 453.0, "grad_norm": 0.4333655536174774, "learning_rate": 1.7e-07, "loss": 0.3468, "step": 4983 }, { "epoch": 453.0969696969697, "grad_norm": 0.25823283195495605, "learning_rate": 1.6e-07, "loss": 0.2911, "step": 4984 }, { "epoch": 453.1939393939394, "grad_norm": 0.24722981452941895, "learning_rate": 1.5000000000000002e-07, "loss": 0.3286, "step": 4985 }, { "epoch": 453.2909090909091, "grad_norm": 0.2904161214828491, "learning_rate": 1.4e-07, "loss": 0.3107, "step": 4986 }, { "epoch": 453.3878787878788, "grad_norm": 0.2927333116531372, "learning_rate": 1.3e-07, "loss": 0.3286, "step": 4987 }, { "epoch": 453.4848484848485, "grad_norm": 0.26876407861709595, "learning_rate": 1.2e-07, "loss": 0.3369, "step": 4988 }, { "epoch": 453.58181818181816, "grad_norm": 0.30206063389778137, "learning_rate": 1.1e-07, "loss": 0.313, "step": 4989 }, { "epoch": 453.6787878787879, "grad_norm": 0.2813703119754791, "learning_rate": 1.0000000000000001e-07, "loss": 0.3211, "step": 4990 }, { "epoch": 453.6787878787879, "eval_loss": 0.42354801297187805, "eval_runtime": 2.0963, "eval_samples_per_second": 26.236, "eval_steps_per_second": 3.339, "step": 4990 }, { "epoch": 453.77575757575755, "grad_norm": 0.2632094621658325, "learning_rate": 9e-08, "loss": 0.3469, "step": 4991 }, { "epoch": 453.8727272727273, "grad_norm": 0.29159316420555115, "learning_rate": 8e-08, "loss": 0.3544, "step": 4992 }, { "epoch": 453.969696969697, "grad_norm": 0.2889498174190521, "learning_rate": 7e-08, "loss": 0.3087, "step": 4993 }, { "epoch": 454.0, "grad_norm": 0.5088276863098145, "learning_rate": 6e-08, "loss": 0.3929, "step": 4994 }, { "epoch": 454.0969696969697, "grad_norm": 0.29596179723739624, "learning_rate": 5.0000000000000004e-08, "loss": 0.3085, "step": 4995 }, { "epoch": 454.1939393939394, "grad_norm": 0.2667468190193176, "learning_rate": 4e-08, "loss": 0.3522, "step": 4996 }, { "epoch": 454.2909090909091, "grad_norm": 0.25791922211647034, "learning_rate": 3e-08, "loss": 0.3197, "step": 4997 }, { "epoch": 454.3878787878788, "grad_norm": 0.26287326216697693, "learning_rate": 2e-08, "loss": 0.3218, "step": 4998 }, { "epoch": 454.4848484848485, "grad_norm": 0.26132941246032715, "learning_rate": 1e-08, "loss": 0.2982, "step": 4999 }, { "epoch": 454.58181818181816, "grad_norm": 0.2780923545360565, "learning_rate": 0.0, "loss": 0.3526, "step": 5000 }, { "epoch": 454.58181818181816, "eval_loss": 0.4235467314720154, "eval_runtime": 2.0906, "eval_samples_per_second": 26.308, "eval_steps_per_second": 3.348, "step": 5000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 500, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.180595246465024e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }