diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5287 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 747, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004016064257028112, + "grad_norm": 6.783391952514648, + "learning_rate": 0.0002, + "loss": 17.0626, + "step": 1 + }, + { + "epoch": 0.008032128514056224, + "grad_norm": 6.832010269165039, + "learning_rate": 0.0001997322623828648, + "loss": 16.3736, + "step": 2 + }, + { + "epoch": 0.012048192771084338, + "grad_norm": 4.412657260894775, + "learning_rate": 0.0001994645247657296, + "loss": 13.7202, + "step": 3 + }, + { + "epoch": 0.01606425702811245, + "grad_norm": 4.6994500160217285, + "learning_rate": 0.0001991967871485944, + "loss": 12.1103, + "step": 4 + }, + { + "epoch": 0.020080321285140562, + "grad_norm": 5.078355312347412, + "learning_rate": 0.00019892904953145918, + "loss": 11.9491, + "step": 5 + }, + { + "epoch": 0.024096385542168676, + "grad_norm": 5.82587194442749, + "learning_rate": 0.00019866131191432397, + "loss": 10.24, + "step": 6 + }, + { + "epoch": 0.028112449799196786, + "grad_norm": 5.521396160125732, + "learning_rate": 0.00019839357429718877, + "loss": 9.7617, + "step": 7 + }, + { + "epoch": 0.0321285140562249, + "grad_norm": 5.55628776550293, + "learning_rate": 0.00019812583668005356, + "loss": 8.9588, + "step": 8 + }, + { + "epoch": 0.03614457831325301, + "grad_norm": 4.77673864364624, + "learning_rate": 0.00019785809906291835, + "loss": 7.413, + "step": 9 + }, + { + "epoch": 0.040160642570281124, + "grad_norm": 3.045475482940674, + "learning_rate": 0.00019759036144578314, + "loss": 8.4555, + "step": 10 + }, + { + "epoch": 0.04417670682730924, + "grad_norm": 2.4188013076782227, + "learning_rate": 0.0001973226238286479, + "loss": 6.3816, + "step": 11 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 2.483142852783203, + "learning_rate": 0.00019705488621151273, + "loss": 6.0486, + "step": 12 + }, + { + "epoch": 0.05220883534136546, + "grad_norm": 2.7488200664520264, + "learning_rate": 0.00019678714859437752, + "loss": 6.0559, + "step": 13 + }, + { + "epoch": 0.05622489959839357, + "grad_norm": 3.509127140045166, + "learning_rate": 0.00019651941097724232, + "loss": 6.5013, + "step": 14 + }, + { + "epoch": 0.060240963855421686, + "grad_norm": 4.097210884094238, + "learning_rate": 0.0001962516733601071, + "loss": 6.6959, + "step": 15 + }, + { + "epoch": 0.0642570281124498, + "grad_norm": 5.211580753326416, + "learning_rate": 0.0001959839357429719, + "loss": 7.4451, + "step": 16 + }, + { + "epoch": 0.06827309236947791, + "grad_norm": 4.360202312469482, + "learning_rate": 0.00019571619812583667, + "loss": 7.5475, + "step": 17 + }, + { + "epoch": 0.07228915662650602, + "grad_norm": 4.646812915802002, + "learning_rate": 0.0001954484605087015, + "loss": 5.9117, + "step": 18 + }, + { + "epoch": 0.07630522088353414, + "grad_norm": 4.076641082763672, + "learning_rate": 0.00019518072289156628, + "loss": 6.5152, + "step": 19 + }, + { + "epoch": 0.08032128514056225, + "grad_norm": 4.571013450622559, + "learning_rate": 0.00019491298527443107, + "loss": 7.7192, + "step": 20 + }, + { + "epoch": 0.08433734939759036, + "grad_norm": 3.786604881286621, + "learning_rate": 0.00019464524765729587, + "loss": 6.0262, + "step": 21 + }, + { + "epoch": 0.08835341365461848, + "grad_norm": 3.7632923126220703, + "learning_rate": 0.00019437751004016066, + "loss": 5.515, + "step": 22 + }, + { + "epoch": 0.09236947791164658, + "grad_norm": 3.142625093460083, + "learning_rate": 0.00019410977242302542, + "loss": 5.5428, + "step": 23 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 4.195131778717041, + "learning_rate": 0.00019384203480589022, + "loss": 5.0073, + "step": 24 + }, + { + "epoch": 0.10040160642570281, + "grad_norm": 7.452038764953613, + "learning_rate": 0.00019357429718875504, + "loss": 5.6765, + "step": 25 + }, + { + "epoch": 0.10441767068273092, + "grad_norm": 9.708063125610352, + "learning_rate": 0.00019330655957161983, + "loss": 5.6149, + "step": 26 + }, + { + "epoch": 0.10843373493975904, + "grad_norm": 19.072011947631836, + "learning_rate": 0.00019303882195448462, + "loss": 5.4365, + "step": 27 + }, + { + "epoch": 0.11244979919678715, + "grad_norm": 6.726373195648193, + "learning_rate": 0.00019277108433734942, + "loss": 4.858, + "step": 28 + }, + { + "epoch": 0.11646586345381527, + "grad_norm": 3.187056064605713, + "learning_rate": 0.0001925033467202142, + "loss": 5.3406, + "step": 29 + }, + { + "epoch": 0.12048192771084337, + "grad_norm": 3.364069700241089, + "learning_rate": 0.00019223560910307897, + "loss": 5.5143, + "step": 30 + }, + { + "epoch": 0.12449799196787148, + "grad_norm": 2.4620518684387207, + "learning_rate": 0.00019196787148594377, + "loss": 4.638, + "step": 31 + }, + { + "epoch": 0.1285140562248996, + "grad_norm": 3.9363696575164795, + "learning_rate": 0.0001917001338688086, + "loss": 4.6009, + "step": 32 + }, + { + "epoch": 0.13253012048192772, + "grad_norm": 3.230189561843872, + "learning_rate": 0.00019143239625167338, + "loss": 4.7928, + "step": 33 + }, + { + "epoch": 0.13654618473895583, + "grad_norm": 2.873898983001709, + "learning_rate": 0.00019116465863453817, + "loss": 3.7444, + "step": 34 + }, + { + "epoch": 0.14056224899598393, + "grad_norm": 3.2136387825012207, + "learning_rate": 0.00019089692101740297, + "loss": 4.452, + "step": 35 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 2.8411664962768555, + "learning_rate": 0.00019062918340026773, + "loss": 4.483, + "step": 36 + }, + { + "epoch": 0.14859437751004015, + "grad_norm": 2.68854022026062, + "learning_rate": 0.00019036144578313252, + "loss": 3.92, + "step": 37 + }, + { + "epoch": 0.15261044176706828, + "grad_norm": 3.324504852294922, + "learning_rate": 0.00019009370816599734, + "loss": 4.4238, + "step": 38 + }, + { + "epoch": 0.1566265060240964, + "grad_norm": 3.0757510662078857, + "learning_rate": 0.00018982597054886214, + "loss": 4.0354, + "step": 39 + }, + { + "epoch": 0.1606425702811245, + "grad_norm": 3.1478559970855713, + "learning_rate": 0.00018955823293172693, + "loss": 4.7587, + "step": 40 + }, + { + "epoch": 0.1646586345381526, + "grad_norm": 2.923387050628662, + "learning_rate": 0.00018929049531459172, + "loss": 4.1713, + "step": 41 + }, + { + "epoch": 0.1686746987951807, + "grad_norm": 3.3262710571289062, + "learning_rate": 0.0001890227576974565, + "loss": 5.7246, + "step": 42 + }, + { + "epoch": 0.17269076305220885, + "grad_norm": 2.9940414428710938, + "learning_rate": 0.00018875502008032128, + "loss": 3.9502, + "step": 43 + }, + { + "epoch": 0.17670682730923695, + "grad_norm": 2.4215221405029297, + "learning_rate": 0.00018848728246318607, + "loss": 3.3469, + "step": 44 + }, + { + "epoch": 0.18072289156626506, + "grad_norm": 4.08881139755249, + "learning_rate": 0.0001882195448460509, + "loss": 3.6203, + "step": 45 + }, + { + "epoch": 0.18473895582329317, + "grad_norm": 2.550448417663574, + "learning_rate": 0.00018795180722891569, + "loss": 3.9986, + "step": 46 + }, + { + "epoch": 0.18875502008032127, + "grad_norm": 2.3286774158477783, + "learning_rate": 0.00018768406961178048, + "loss": 3.3749, + "step": 47 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 2.724431276321411, + "learning_rate": 0.00018741633199464524, + "loss": 3.4734, + "step": 48 + }, + { + "epoch": 0.19678714859437751, + "grad_norm": 2.961087226867676, + "learning_rate": 0.00018714859437751004, + "loss": 4.242, + "step": 49 + }, + { + "epoch": 0.20080321285140562, + "grad_norm": 2.4245645999908447, + "learning_rate": 0.00018688085676037483, + "loss": 3.7956, + "step": 50 + }, + { + "epoch": 0.20481927710843373, + "grad_norm": 2.141226053237915, + "learning_rate": 0.00018661311914323962, + "loss": 3.0041, + "step": 51 + }, + { + "epoch": 0.20883534136546184, + "grad_norm": 2.7774155139923096, + "learning_rate": 0.00018634538152610444, + "loss": 3.5062, + "step": 52 + }, + { + "epoch": 0.21285140562248997, + "grad_norm": 2.6332597732543945, + "learning_rate": 0.00018607764390896924, + "loss": 3.9305, + "step": 53 + }, + { + "epoch": 0.21686746987951808, + "grad_norm": 3.4417197704315186, + "learning_rate": 0.000185809906291834, + "loss": 5.1481, + "step": 54 + }, + { + "epoch": 0.22088353413654618, + "grad_norm": 2.576704978942871, + "learning_rate": 0.0001855421686746988, + "loss": 3.6137, + "step": 55 + }, + { + "epoch": 0.2248995983935743, + "grad_norm": 2.816452980041504, + "learning_rate": 0.0001852744310575636, + "loss": 3.5015, + "step": 56 + }, + { + "epoch": 0.2289156626506024, + "grad_norm": 3.5300023555755615, + "learning_rate": 0.00018500669344042838, + "loss": 4.7758, + "step": 57 + }, + { + "epoch": 0.23293172690763053, + "grad_norm": 2.594787120819092, + "learning_rate": 0.0001847389558232932, + "loss": 4.0104, + "step": 58 + }, + { + "epoch": 0.23694779116465864, + "grad_norm": 3.472842216491699, + "learning_rate": 0.000184471218206158, + "loss": 4.2051, + "step": 59 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 2.195838212966919, + "learning_rate": 0.00018420348058902276, + "loss": 3.4561, + "step": 60 + }, + { + "epoch": 0.24497991967871485, + "grad_norm": 2.6737020015716553, + "learning_rate": 0.00018393574297188755, + "loss": 5.4281, + "step": 61 + }, + { + "epoch": 0.24899598393574296, + "grad_norm": 3.128307342529297, + "learning_rate": 0.00018366800535475234, + "loss": 4.835, + "step": 62 + }, + { + "epoch": 0.25301204819277107, + "grad_norm": 2.8915627002716064, + "learning_rate": 0.00018340026773761714, + "loss": 5.6513, + "step": 63 + }, + { + "epoch": 0.2570281124497992, + "grad_norm": 2.4325616359710693, + "learning_rate": 0.00018313253012048193, + "loss": 3.8769, + "step": 64 + }, + { + "epoch": 0.26104417670682734, + "grad_norm": 2.717306613922119, + "learning_rate": 0.00018286479250334675, + "loss": 4.7258, + "step": 65 + }, + { + "epoch": 0.26506024096385544, + "grad_norm": 2.6178746223449707, + "learning_rate": 0.00018259705488621152, + "loss": 4.0424, + "step": 66 + }, + { + "epoch": 0.26907630522088355, + "grad_norm": 2.382551431655884, + "learning_rate": 0.0001823293172690763, + "loss": 3.547, + "step": 67 + }, + { + "epoch": 0.27309236947791166, + "grad_norm": 2.546783685684204, + "learning_rate": 0.0001820615796519411, + "loss": 4.2495, + "step": 68 + }, + { + "epoch": 0.27710843373493976, + "grad_norm": 2.4738221168518066, + "learning_rate": 0.0001817938420348059, + "loss": 3.69, + "step": 69 + }, + { + "epoch": 0.28112449799196787, + "grad_norm": 2.2191786766052246, + "learning_rate": 0.0001815261044176707, + "loss": 3.1576, + "step": 70 + }, + { + "epoch": 0.285140562248996, + "grad_norm": 2.4891932010650635, + "learning_rate": 0.00018125836680053548, + "loss": 3.7767, + "step": 71 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 2.0602684020996094, + "learning_rate": 0.00018099062918340027, + "loss": 3.1497, + "step": 72 + }, + { + "epoch": 0.2931726907630522, + "grad_norm": 2.435455560684204, + "learning_rate": 0.00018072289156626507, + "loss": 4.3061, + "step": 73 + }, + { + "epoch": 0.2971887550200803, + "grad_norm": 2.7304036617279053, + "learning_rate": 0.00018045515394912986, + "loss": 3.6995, + "step": 74 + }, + { + "epoch": 0.30120481927710846, + "grad_norm": 2.6375226974487305, + "learning_rate": 0.00018018741633199465, + "loss": 3.3922, + "step": 75 + }, + { + "epoch": 0.30522088353413657, + "grad_norm": 2.097759246826172, + "learning_rate": 0.00017991967871485944, + "loss": 3.1887, + "step": 76 + }, + { + "epoch": 0.3092369477911647, + "grad_norm": 2.600724458694458, + "learning_rate": 0.00017965194109772424, + "loss": 3.8532, + "step": 77 + }, + { + "epoch": 0.3132530120481928, + "grad_norm": 3.0356369018554688, + "learning_rate": 0.00017938420348058903, + "loss": 4.6221, + "step": 78 + }, + { + "epoch": 0.3172690763052209, + "grad_norm": 2.1509416103363037, + "learning_rate": 0.00017911646586345382, + "loss": 3.5473, + "step": 79 + }, + { + "epoch": 0.321285140562249, + "grad_norm": 2.7542128562927246, + "learning_rate": 0.00017884872824631862, + "loss": 4.3206, + "step": 80 + }, + { + "epoch": 0.3253012048192771, + "grad_norm": 2.7480881214141846, + "learning_rate": 0.0001785809906291834, + "loss": 3.4596, + "step": 81 + }, + { + "epoch": 0.3293172690763052, + "grad_norm": 2.8787624835968018, + "learning_rate": 0.0001783132530120482, + "loss": 4.0409, + "step": 82 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.234320878982544, + "learning_rate": 0.000178045515394913, + "loss": 3.6684, + "step": 83 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 2.174452781677246, + "learning_rate": 0.00017777777777777779, + "loss": 3.8964, + "step": 84 + }, + { + "epoch": 0.3413654618473896, + "grad_norm": 2.25730299949646, + "learning_rate": 0.00017751004016064258, + "loss": 3.3793, + "step": 85 + }, + { + "epoch": 0.3453815261044177, + "grad_norm": 2.3120176792144775, + "learning_rate": 0.00017724230254350737, + "loss": 3.9183, + "step": 86 + }, + { + "epoch": 0.3493975903614458, + "grad_norm": 2.696288824081421, + "learning_rate": 0.00017697456492637216, + "loss": 4.1063, + "step": 87 + }, + { + "epoch": 0.3534136546184739, + "grad_norm": 3.9386634826660156, + "learning_rate": 0.00017670682730923696, + "loss": 4.599, + "step": 88 + }, + { + "epoch": 0.357429718875502, + "grad_norm": 2.7136473655700684, + "learning_rate": 0.00017643908969210175, + "loss": 4.1535, + "step": 89 + }, + { + "epoch": 0.3614457831325301, + "grad_norm": 2.4276645183563232, + "learning_rate": 0.00017617135207496654, + "loss": 4.4834, + "step": 90 + }, + { + "epoch": 0.3654618473895582, + "grad_norm": 2.6002511978149414, + "learning_rate": 0.00017590361445783134, + "loss": 4.0748, + "step": 91 + }, + { + "epoch": 0.36947791164658633, + "grad_norm": 2.682366132736206, + "learning_rate": 0.00017563587684069613, + "loss": 4.4142, + "step": 92 + }, + { + "epoch": 0.37349397590361444, + "grad_norm": 2.108722686767578, + "learning_rate": 0.00017536813922356092, + "loss": 4.4304, + "step": 93 + }, + { + "epoch": 0.37751004016064255, + "grad_norm": 2.0732803344726562, + "learning_rate": 0.00017510040160642571, + "loss": 3.2521, + "step": 94 + }, + { + "epoch": 0.3815261044176707, + "grad_norm": 2.3038790225982666, + "learning_rate": 0.0001748326639892905, + "loss": 4.3167, + "step": 95 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 2.623572587966919, + "learning_rate": 0.0001745649263721553, + "loss": 5.3465, + "step": 96 + }, + { + "epoch": 0.3895582329317269, + "grad_norm": 2.4543046951293945, + "learning_rate": 0.0001742971887550201, + "loss": 3.4479, + "step": 97 + }, + { + "epoch": 0.39357429718875503, + "grad_norm": 2.291369915008545, + "learning_rate": 0.00017402945113788489, + "loss": 4.0893, + "step": 98 + }, + { + "epoch": 0.39759036144578314, + "grad_norm": 2.4371914863586426, + "learning_rate": 0.00017376171352074968, + "loss": 3.7132, + "step": 99 + }, + { + "epoch": 0.40160642570281124, + "grad_norm": 2.1401989459991455, + "learning_rate": 0.00017349397590361447, + "loss": 2.9892, + "step": 100 + }, + { + "epoch": 0.40562248995983935, + "grad_norm": 2.1574857234954834, + "learning_rate": 0.00017322623828647926, + "loss": 3.3145, + "step": 101 + }, + { + "epoch": 0.40963855421686746, + "grad_norm": 2.7298076152801514, + "learning_rate": 0.00017295850066934406, + "loss": 4.2365, + "step": 102 + }, + { + "epoch": 0.41365461847389556, + "grad_norm": 2.5634846687316895, + "learning_rate": 0.00017269076305220885, + "loss": 3.4466, + "step": 103 + }, + { + "epoch": 0.41767068273092367, + "grad_norm": 2.573195695877075, + "learning_rate": 0.00017242302543507362, + "loss": 3.3283, + "step": 104 + }, + { + "epoch": 0.42168674698795183, + "grad_norm": 2.205293655395508, + "learning_rate": 0.00017215528781793844, + "loss": 3.7288, + "step": 105 + }, + { + "epoch": 0.42570281124497994, + "grad_norm": 3.3177073001861572, + "learning_rate": 0.00017188755020080323, + "loss": 3.9341, + "step": 106 + }, + { + "epoch": 0.42971887550200805, + "grad_norm": 2.601710557937622, + "learning_rate": 0.00017161981258366802, + "loss": 4.3724, + "step": 107 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 2.490556478500366, + "learning_rate": 0.00017135207496653281, + "loss": 3.0784, + "step": 108 + }, + { + "epoch": 0.43775100401606426, + "grad_norm": 2.7771122455596924, + "learning_rate": 0.0001710843373493976, + "loss": 3.7125, + "step": 109 + }, + { + "epoch": 0.44176706827309237, + "grad_norm": 2.9865031242370605, + "learning_rate": 0.00017081659973226237, + "loss": 4.9747, + "step": 110 + }, + { + "epoch": 0.4457831325301205, + "grad_norm": 3.2922353744506836, + "learning_rate": 0.00017054886211512717, + "loss": 4.229, + "step": 111 + }, + { + "epoch": 0.4497991967871486, + "grad_norm": 2.2360899448394775, + "learning_rate": 0.00017028112449799199, + "loss": 3.1859, + "step": 112 + }, + { + "epoch": 0.4538152610441767, + "grad_norm": 2.4282941818237305, + "learning_rate": 0.00017001338688085678, + "loss": 4.4577, + "step": 113 + }, + { + "epoch": 0.4578313253012048, + "grad_norm": 2.2384181022644043, + "learning_rate": 0.00016974564926372157, + "loss": 3.435, + "step": 114 + }, + { + "epoch": 0.46184738955823296, + "grad_norm": 2.586678981781006, + "learning_rate": 0.00016947791164658636, + "loss": 3.7974, + "step": 115 + }, + { + "epoch": 0.46586345381526106, + "grad_norm": 2.2473366260528564, + "learning_rate": 0.00016921017402945113, + "loss": 3.2193, + "step": 116 + }, + { + "epoch": 0.46987951807228917, + "grad_norm": 2.2137515544891357, + "learning_rate": 0.00016894243641231592, + "loss": 3.2774, + "step": 117 + }, + { + "epoch": 0.4738955823293173, + "grad_norm": 2.6827173233032227, + "learning_rate": 0.00016867469879518074, + "loss": 3.843, + "step": 118 + }, + { + "epoch": 0.4779116465863454, + "grad_norm": 2.499166250228882, + "learning_rate": 0.00016840696117804553, + "loss": 3.1818, + "step": 119 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 2.609964609146118, + "learning_rate": 0.00016813922356091033, + "loss": 3.6292, + "step": 120 + }, + { + "epoch": 0.4859437751004016, + "grad_norm": 2.697786808013916, + "learning_rate": 0.00016787148594377512, + "loss": 3.7501, + "step": 121 + }, + { + "epoch": 0.4899598393574297, + "grad_norm": 2.834494113922119, + "learning_rate": 0.00016760374832663989, + "loss": 3.9265, + "step": 122 + }, + { + "epoch": 0.4939759036144578, + "grad_norm": 2.3431777954101562, + "learning_rate": 0.00016733601070950468, + "loss": 3.7916, + "step": 123 + }, + { + "epoch": 0.4979919678714859, + "grad_norm": 2.434953212738037, + "learning_rate": 0.00016706827309236947, + "loss": 3.4279, + "step": 124 + }, + { + "epoch": 0.5020080321285141, + "grad_norm": 2.3629250526428223, + "learning_rate": 0.0001668005354752343, + "loss": 3.4382, + "step": 125 + }, + { + "epoch": 0.5060240963855421, + "grad_norm": 2.7543423175811768, + "learning_rate": 0.00016653279785809908, + "loss": 4.8146, + "step": 126 + }, + { + "epoch": 0.5100401606425703, + "grad_norm": 3.149775981903076, + "learning_rate": 0.00016626506024096388, + "loss": 5.365, + "step": 127 + }, + { + "epoch": 0.5140562248995983, + "grad_norm": 2.640326499938965, + "learning_rate": 0.00016599732262382864, + "loss": 4.2036, + "step": 128 + }, + { + "epoch": 0.5180722891566265, + "grad_norm": 2.6297357082366943, + "learning_rate": 0.00016572958500669344, + "loss": 3.7331, + "step": 129 + }, + { + "epoch": 0.5220883534136547, + "grad_norm": 2.9165263175964355, + "learning_rate": 0.00016546184738955823, + "loss": 4.2224, + "step": 130 + }, + { + "epoch": 0.5261044176706827, + "grad_norm": 2.003908634185791, + "learning_rate": 0.00016519410977242302, + "loss": 3.5818, + "step": 131 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 2.3137078285217285, + "learning_rate": 0.00016492637215528784, + "loss": 3.4726, + "step": 132 + }, + { + "epoch": 0.5341365461847389, + "grad_norm": 2.69950795173645, + "learning_rate": 0.00016465863453815263, + "loss": 4.0059, + "step": 133 + }, + { + "epoch": 0.5381526104417671, + "grad_norm": 2.1858394145965576, + "learning_rate": 0.0001643908969210174, + "loss": 3.6957, + "step": 134 + }, + { + "epoch": 0.5421686746987951, + "grad_norm": 2.423802137374878, + "learning_rate": 0.0001641231593038822, + "loss": 4.1535, + "step": 135 + }, + { + "epoch": 0.5461847389558233, + "grad_norm": 2.244253158569336, + "learning_rate": 0.00016385542168674699, + "loss": 3.3276, + "step": 136 + }, + { + "epoch": 0.5502008032128514, + "grad_norm": 2.2932465076446533, + "learning_rate": 0.00016358768406961178, + "loss": 3.6498, + "step": 137 + }, + { + "epoch": 0.5542168674698795, + "grad_norm": 2.0782933235168457, + "learning_rate": 0.0001633199464524766, + "loss": 4.007, + "step": 138 + }, + { + "epoch": 0.5582329317269076, + "grad_norm": 2.778797149658203, + "learning_rate": 0.0001630522088353414, + "loss": 3.8436, + "step": 139 + }, + { + "epoch": 0.5622489959839357, + "grad_norm": 2.7823002338409424, + "learning_rate": 0.00016278447121820616, + "loss": 5.5985, + "step": 140 + }, + { + "epoch": 0.5662650602409639, + "grad_norm": 3.124753475189209, + "learning_rate": 0.00016251673360107095, + "loss": 3.8402, + "step": 141 + }, + { + "epoch": 0.570281124497992, + "grad_norm": 2.999889612197876, + "learning_rate": 0.00016224899598393574, + "loss": 4.8463, + "step": 142 + }, + { + "epoch": 0.5742971887550201, + "grad_norm": 2.2176406383514404, + "learning_rate": 0.00016198125836680054, + "loss": 3.6488, + "step": 143 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 2.334336757659912, + "learning_rate": 0.00016171352074966533, + "loss": 3.4351, + "step": 144 + }, + { + "epoch": 0.5823293172690763, + "grad_norm": 2.1625120639801025, + "learning_rate": 0.00016144578313253015, + "loss": 3.4423, + "step": 145 + }, + { + "epoch": 0.5863453815261044, + "grad_norm": 2.3950042724609375, + "learning_rate": 0.00016117804551539491, + "loss": 3.4302, + "step": 146 + }, + { + "epoch": 0.5903614457831325, + "grad_norm": 1.968996524810791, + "learning_rate": 0.0001609103078982597, + "loss": 3.3924, + "step": 147 + }, + { + "epoch": 0.5943775100401606, + "grad_norm": 2.259298801422119, + "learning_rate": 0.0001606425702811245, + "loss": 3.4544, + "step": 148 + }, + { + "epoch": 0.5983935742971888, + "grad_norm": 2.5227410793304443, + "learning_rate": 0.0001603748326639893, + "loss": 3.6276, + "step": 149 + }, + { + "epoch": 0.6024096385542169, + "grad_norm": 2.4112424850463867, + "learning_rate": 0.00016010709504685409, + "loss": 3.8806, + "step": 150 + }, + { + "epoch": 0.606425702811245, + "grad_norm": 2.5478017330169678, + "learning_rate": 0.00015983935742971888, + "loss": 4.1461, + "step": 151 + }, + { + "epoch": 0.6104417670682731, + "grad_norm": 2.832744836807251, + "learning_rate": 0.00015957161981258367, + "loss": 5.0162, + "step": 152 + }, + { + "epoch": 0.6144578313253012, + "grad_norm": 2.7249608039855957, + "learning_rate": 0.00015930388219544846, + "loss": 3.2521, + "step": 153 + }, + { + "epoch": 0.6184738955823293, + "grad_norm": 2.579235315322876, + "learning_rate": 0.00015903614457831326, + "loss": 4.0444, + "step": 154 + }, + { + "epoch": 0.6224899598393574, + "grad_norm": 2.719031572341919, + "learning_rate": 0.00015876840696117805, + "loss": 3.8091, + "step": 155 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 2.9060187339782715, + "learning_rate": 0.00015850066934404284, + "loss": 3.574, + "step": 156 + }, + { + "epoch": 0.6305220883534136, + "grad_norm": 2.3890836238861084, + "learning_rate": 0.00015823293172690763, + "loss": 3.0126, + "step": 157 + }, + { + "epoch": 0.6345381526104418, + "grad_norm": 2.4875965118408203, + "learning_rate": 0.00015796519410977243, + "loss": 3.8722, + "step": 158 + }, + { + "epoch": 0.6385542168674698, + "grad_norm": 2.452133893966675, + "learning_rate": 0.00015769745649263722, + "loss": 3.1996, + "step": 159 + }, + { + "epoch": 0.642570281124498, + "grad_norm": 2.644927740097046, + "learning_rate": 0.000157429718875502, + "loss": 4.5955, + "step": 160 + }, + { + "epoch": 0.6465863453815262, + "grad_norm": 2.4523508548736572, + "learning_rate": 0.0001571619812583668, + "loss": 3.3654, + "step": 161 + }, + { + "epoch": 0.6506024096385542, + "grad_norm": 2.5598349571228027, + "learning_rate": 0.0001568942436412316, + "loss": 3.0078, + "step": 162 + }, + { + "epoch": 0.6546184738955824, + "grad_norm": 3.0518641471862793, + "learning_rate": 0.0001566265060240964, + "loss": 4.5464, + "step": 163 + }, + { + "epoch": 0.6586345381526104, + "grad_norm": 2.8101203441619873, + "learning_rate": 0.00015635876840696118, + "loss": 3.4404, + "step": 164 + }, + { + "epoch": 0.6626506024096386, + "grad_norm": 2.7174525260925293, + "learning_rate": 0.00015609103078982598, + "loss": 3.6615, + "step": 165 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.620638608932495, + "learning_rate": 0.00015582329317269077, + "loss": 3.448, + "step": 166 + }, + { + "epoch": 0.6706827309236948, + "grad_norm": 2.9395246505737305, + "learning_rate": 0.00015555555555555556, + "loss": 3.6454, + "step": 167 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 3.050710916519165, + "learning_rate": 0.00015528781793842036, + "loss": 4.0765, + "step": 168 + }, + { + "epoch": 0.678714859437751, + "grad_norm": 2.2552433013916016, + "learning_rate": 0.00015502008032128515, + "loss": 3.1558, + "step": 169 + }, + { + "epoch": 0.6827309236947792, + "grad_norm": 2.1489574909210205, + "learning_rate": 0.00015475234270414994, + "loss": 4.2047, + "step": 170 + }, + { + "epoch": 0.6867469879518072, + "grad_norm": 2.172776937484741, + "learning_rate": 0.00015448460508701473, + "loss": 3.4285, + "step": 171 + }, + { + "epoch": 0.6907630522088354, + "grad_norm": 2.1401731967926025, + "learning_rate": 0.00015421686746987953, + "loss": 3.2497, + "step": 172 + }, + { + "epoch": 0.6947791164658634, + "grad_norm": 2.7701947689056396, + "learning_rate": 0.00015394912985274432, + "loss": 3.9331, + "step": 173 + }, + { + "epoch": 0.6987951807228916, + "grad_norm": 2.319415330886841, + "learning_rate": 0.0001536813922356091, + "loss": 3.176, + "step": 174 + }, + { + "epoch": 0.7028112449799196, + "grad_norm": 2.428131341934204, + "learning_rate": 0.0001534136546184739, + "loss": 3.1192, + "step": 175 + }, + { + "epoch": 0.7068273092369478, + "grad_norm": 2.135892868041992, + "learning_rate": 0.0001531459170013387, + "loss": 3.0222, + "step": 176 + }, + { + "epoch": 0.7108433734939759, + "grad_norm": 2.7550647258758545, + "learning_rate": 0.0001528781793842035, + "loss": 4.6775, + "step": 177 + }, + { + "epoch": 0.714859437751004, + "grad_norm": 2.2021191120147705, + "learning_rate": 0.00015261044176706828, + "loss": 2.7476, + "step": 178 + }, + { + "epoch": 0.7188755020080321, + "grad_norm": 2.686431407928467, + "learning_rate": 0.00015234270414993308, + "loss": 4.1621, + "step": 179 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 2.827143669128418, + "learning_rate": 0.00015207496653279787, + "loss": 4.4613, + "step": 180 + }, + { + "epoch": 0.7269076305220884, + "grad_norm": 3.090308904647827, + "learning_rate": 0.00015180722891566266, + "loss": 4.6863, + "step": 181 + }, + { + "epoch": 0.7309236947791165, + "grad_norm": 2.492013454437256, + "learning_rate": 0.00015153949129852746, + "loss": 3.2319, + "step": 182 + }, + { + "epoch": 0.7349397590361446, + "grad_norm": 2.6304264068603516, + "learning_rate": 0.00015127175368139225, + "loss": 3.3099, + "step": 183 + }, + { + "epoch": 0.7389558232931727, + "grad_norm": 2.270024299621582, + "learning_rate": 0.00015100401606425701, + "loss": 3.8332, + "step": 184 + }, + { + "epoch": 0.7429718875502008, + "grad_norm": 2.2107675075531006, + "learning_rate": 0.00015073627844712183, + "loss": 3.4966, + "step": 185 + }, + { + "epoch": 0.7469879518072289, + "grad_norm": 1.804654598236084, + "learning_rate": 0.00015046854082998663, + "loss": 2.7441, + "step": 186 + }, + { + "epoch": 0.751004016064257, + "grad_norm": 2.8919899463653564, + "learning_rate": 0.00015020080321285142, + "loss": 3.7274, + "step": 187 + }, + { + "epoch": 0.7550200803212851, + "grad_norm": 2.4757237434387207, + "learning_rate": 0.0001499330655957162, + "loss": 3.6959, + "step": 188 + }, + { + "epoch": 0.7590361445783133, + "grad_norm": 2.037745952606201, + "learning_rate": 0.000149665327978581, + "loss": 3.0673, + "step": 189 + }, + { + "epoch": 0.7630522088353414, + "grad_norm": 2.479806423187256, + "learning_rate": 0.00014939759036144577, + "loss": 3.5497, + "step": 190 + }, + { + "epoch": 0.7670682730923695, + "grad_norm": 2.532616138458252, + "learning_rate": 0.00014912985274431056, + "loss": 4.4538, + "step": 191 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 2.2965128421783447, + "learning_rate": 0.00014886211512717538, + "loss": 3.8924, + "step": 192 + }, + { + "epoch": 0.7751004016064257, + "grad_norm": 2.569096088409424, + "learning_rate": 0.00014859437751004018, + "loss": 4.3112, + "step": 193 + }, + { + "epoch": 0.7791164658634538, + "grad_norm": 2.3299782276153564, + "learning_rate": 0.00014832663989290497, + "loss": 3.4171, + "step": 194 + }, + { + "epoch": 0.7831325301204819, + "grad_norm": 2.4750306606292725, + "learning_rate": 0.00014805890227576976, + "loss": 4.2418, + "step": 195 + }, + { + "epoch": 0.7871485943775101, + "grad_norm": 2.34830904006958, + "learning_rate": 0.00014779116465863453, + "loss": 4.7654, + "step": 196 + }, + { + "epoch": 0.7911646586345381, + "grad_norm": 2.3084421157836914, + "learning_rate": 0.00014752342704149932, + "loss": 3.5955, + "step": 197 + }, + { + "epoch": 0.7951807228915663, + "grad_norm": 2.088836431503296, + "learning_rate": 0.00014725568942436414, + "loss": 3.4426, + "step": 198 + }, + { + "epoch": 0.7991967871485943, + "grad_norm": 2.387511968612671, + "learning_rate": 0.00014698795180722893, + "loss": 3.4799, + "step": 199 + }, + { + "epoch": 0.8032128514056225, + "grad_norm": 2.173638343811035, + "learning_rate": 0.00014672021419009373, + "loss": 3.1073, + "step": 200 + }, + { + "epoch": 0.8072289156626506, + "grad_norm": 2.4268410205841064, + "learning_rate": 0.00014645247657295852, + "loss": 3.895, + "step": 201 + }, + { + "epoch": 0.8112449799196787, + "grad_norm": 2.298238515853882, + "learning_rate": 0.00014618473895582328, + "loss": 3.1374, + "step": 202 + }, + { + "epoch": 0.8152610441767069, + "grad_norm": 2.5447280406951904, + "learning_rate": 0.00014591700133868808, + "loss": 4.201, + "step": 203 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 2.2700531482696533, + "learning_rate": 0.00014564926372155287, + "loss": 3.3756, + "step": 204 + }, + { + "epoch": 0.8232931726907631, + "grad_norm": 2.2147793769836426, + "learning_rate": 0.0001453815261044177, + "loss": 2.8677, + "step": 205 + }, + { + "epoch": 0.8273092369477911, + "grad_norm": 2.820615768432617, + "learning_rate": 0.00014511378848728248, + "loss": 3.8278, + "step": 206 + }, + { + "epoch": 0.8313253012048193, + "grad_norm": 2.214066743850708, + "learning_rate": 0.00014484605087014728, + "loss": 2.8015, + "step": 207 + }, + { + "epoch": 0.8353413654618473, + "grad_norm": 2.7223362922668457, + "learning_rate": 0.00014457831325301204, + "loss": 4.5482, + "step": 208 + }, + { + "epoch": 0.8393574297188755, + "grad_norm": 2.6131458282470703, + "learning_rate": 0.00014431057563587683, + "loss": 3.258, + "step": 209 + }, + { + "epoch": 0.8433734939759037, + "grad_norm": 2.378821611404419, + "learning_rate": 0.00014404283801874163, + "loss": 3.4395, + "step": 210 + }, + { + "epoch": 0.8473895582329317, + "grad_norm": 2.5394039154052734, + "learning_rate": 0.00014377510040160642, + "loss": 3.5583, + "step": 211 + }, + { + "epoch": 0.8514056224899599, + "grad_norm": 2.8768603801727295, + "learning_rate": 0.00014350736278447124, + "loss": 4.1826, + "step": 212 + }, + { + "epoch": 0.8554216867469879, + "grad_norm": 2.325242757797241, + "learning_rate": 0.00014323962516733603, + "loss": 3.2996, + "step": 213 + }, + { + "epoch": 0.8594377510040161, + "grad_norm": 2.847722053527832, + "learning_rate": 0.0001429718875502008, + "loss": 3.7535, + "step": 214 + }, + { + "epoch": 0.8634538152610441, + "grad_norm": 2.3787224292755127, + "learning_rate": 0.0001427041499330656, + "loss": 2.989, + "step": 215 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 2.3759453296661377, + "learning_rate": 0.00014243641231593038, + "loss": 3.2181, + "step": 216 + }, + { + "epoch": 0.8714859437751004, + "grad_norm": 2.48319411277771, + "learning_rate": 0.00014216867469879518, + "loss": 4.0624, + "step": 217 + }, + { + "epoch": 0.8755020080321285, + "grad_norm": 2.75231671333313, + "learning_rate": 0.00014190093708166, + "loss": 4.2616, + "step": 218 + }, + { + "epoch": 0.8795180722891566, + "grad_norm": 2.165195941925049, + "learning_rate": 0.0001416331994645248, + "loss": 2.773, + "step": 219 + }, + { + "epoch": 0.8835341365461847, + "grad_norm": 2.9390523433685303, + "learning_rate": 0.00014136546184738956, + "loss": 5.3133, + "step": 220 + }, + { + "epoch": 0.8875502008032129, + "grad_norm": 2.4109458923339844, + "learning_rate": 0.00014109772423025435, + "loss": 3.8292, + "step": 221 + }, + { + "epoch": 0.891566265060241, + "grad_norm": 2.5037901401519775, + "learning_rate": 0.00014082998661311914, + "loss": 4.0122, + "step": 222 + }, + { + "epoch": 0.8955823293172691, + "grad_norm": 2.985944986343384, + "learning_rate": 0.00014056224899598393, + "loss": 3.7539, + "step": 223 + }, + { + "epoch": 0.8995983935742972, + "grad_norm": 2.2456915378570557, + "learning_rate": 0.00014029451137884873, + "loss": 3.4707, + "step": 224 + }, + { + "epoch": 0.9036144578313253, + "grad_norm": 2.0935449600219727, + "learning_rate": 0.00014002677376171355, + "loss": 2.7515, + "step": 225 + }, + { + "epoch": 0.9076305220883534, + "grad_norm": 2.4609766006469727, + "learning_rate": 0.00013975903614457834, + "loss": 3.8227, + "step": 226 + }, + { + "epoch": 0.9116465863453815, + "grad_norm": 2.2097980976104736, + "learning_rate": 0.0001394912985274431, + "loss": 3.2733, + "step": 227 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 2.0642688274383545, + "learning_rate": 0.0001392235609103079, + "loss": 3.0938, + "step": 228 + }, + { + "epoch": 0.9196787148594378, + "grad_norm": 2.3710100650787354, + "learning_rate": 0.0001389558232931727, + "loss": 4.2002, + "step": 229 + }, + { + "epoch": 0.9236947791164659, + "grad_norm": 2.6360647678375244, + "learning_rate": 0.00013868808567603748, + "loss": 3.8326, + "step": 230 + }, + { + "epoch": 0.927710843373494, + "grad_norm": 2.2522687911987305, + "learning_rate": 0.00013842034805890228, + "loss": 4.0576, + "step": 231 + }, + { + "epoch": 0.9317269076305221, + "grad_norm": 2.3965373039245605, + "learning_rate": 0.0001381526104417671, + "loss": 2.551, + "step": 232 + }, + { + "epoch": 0.9357429718875502, + "grad_norm": 2.160850763320923, + "learning_rate": 0.00013788487282463186, + "loss": 3.0346, + "step": 233 + }, + { + "epoch": 0.9397590361445783, + "grad_norm": 2.7340362071990967, + "learning_rate": 0.00013761713520749665, + "loss": 3.8792, + "step": 234 + }, + { + "epoch": 0.9437751004016064, + "grad_norm": 2.373431921005249, + "learning_rate": 0.00013734939759036145, + "loss": 3.4563, + "step": 235 + }, + { + "epoch": 0.9477911646586346, + "grad_norm": 2.887669801712036, + "learning_rate": 0.00013708165997322624, + "loss": 3.4205, + "step": 236 + }, + { + "epoch": 0.9518072289156626, + "grad_norm": 2.47088360786438, + "learning_rate": 0.00013681392235609103, + "loss": 3.7738, + "step": 237 + }, + { + "epoch": 0.9558232931726908, + "grad_norm": 2.7040438652038574, + "learning_rate": 0.00013654618473895585, + "loss": 3.5389, + "step": 238 + }, + { + "epoch": 0.9598393574297188, + "grad_norm": 2.2656071186065674, + "learning_rate": 0.00013627844712182062, + "loss": 2.5192, + "step": 239 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 2.0689640045166016, + "learning_rate": 0.0001360107095046854, + "loss": 3.2038, + "step": 240 + }, + { + "epoch": 0.9678714859437751, + "grad_norm": 2.456049680709839, + "learning_rate": 0.0001357429718875502, + "loss": 3.3779, + "step": 241 + }, + { + "epoch": 0.9718875502008032, + "grad_norm": 3.6520512104034424, + "learning_rate": 0.000135475234270415, + "loss": 6.3828, + "step": 242 + }, + { + "epoch": 0.9759036144578314, + "grad_norm": 2.9019930362701416, + "learning_rate": 0.0001352074966532798, + "loss": 4.4033, + "step": 243 + }, + { + "epoch": 0.9799196787148594, + "grad_norm": 2.688805103302002, + "learning_rate": 0.00013493975903614458, + "loss": 3.7718, + "step": 244 + }, + { + "epoch": 0.9839357429718876, + "grad_norm": 2.3583173751831055, + "learning_rate": 0.00013467202141900938, + "loss": 2.8558, + "step": 245 + }, + { + "epoch": 0.9879518072289156, + "grad_norm": 2.2991857528686523, + "learning_rate": 0.00013440428380187417, + "loss": 3.3544, + "step": 246 + }, + { + "epoch": 0.9919678714859438, + "grad_norm": 2.3462352752685547, + "learning_rate": 0.00013413654618473896, + "loss": 3.4804, + "step": 247 + }, + { + "epoch": 0.9959839357429718, + "grad_norm": 2.375304698944092, + "learning_rate": 0.00013386880856760375, + "loss": 3.9284, + "step": 248 + }, + { + "epoch": 1.0, + "grad_norm": 2.3574721813201904, + "learning_rate": 0.00013360107095046855, + "loss": 3.5948, + "step": 249 + }, + { + "epoch": 1.0, + "eval_loss": 0.906198263168335, + "eval_runtime": 202.0311, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 1.237, + "step": 249 + }, + { + "epoch": 1.0040160642570282, + "grad_norm": 2.329230546951294, + "learning_rate": 0.00013333333333333334, + "loss": 3.8794, + "step": 250 + }, + { + "epoch": 1.0080321285140563, + "grad_norm": 2.304131507873535, + "learning_rate": 0.00013306559571619813, + "loss": 2.618, + "step": 251 + }, + { + "epoch": 1.0120481927710843, + "grad_norm": 2.258854389190674, + "learning_rate": 0.00013279785809906293, + "loss": 4.5112, + "step": 252 + }, + { + "epoch": 1.0160642570281124, + "grad_norm": 1.9307198524475098, + "learning_rate": 0.00013253012048192772, + "loss": 2.8023, + "step": 253 + }, + { + "epoch": 1.0200803212851406, + "grad_norm": 2.070939540863037, + "learning_rate": 0.0001322623828647925, + "loss": 2.9067, + "step": 254 + }, + { + "epoch": 1.0240963855421688, + "grad_norm": 2.1403632164001465, + "learning_rate": 0.0001319946452476573, + "loss": 3.0498, + "step": 255 + }, + { + "epoch": 1.0281124497991967, + "grad_norm": 1.9982527494430542, + "learning_rate": 0.0001317269076305221, + "loss": 2.7652, + "step": 256 + }, + { + "epoch": 1.0321285140562249, + "grad_norm": 2.3440232276916504, + "learning_rate": 0.0001314591700133869, + "loss": 3.8854, + "step": 257 + }, + { + "epoch": 1.036144578313253, + "grad_norm": 2.3406286239624023, + "learning_rate": 0.00013119143239625168, + "loss": 2.9114, + "step": 258 + }, + { + "epoch": 1.0401606425702812, + "grad_norm": 2.673793077468872, + "learning_rate": 0.00013092369477911648, + "loss": 3.0531, + "step": 259 + }, + { + "epoch": 1.0441767068273093, + "grad_norm": 2.2808480262756348, + "learning_rate": 0.00013065595716198127, + "loss": 2.9484, + "step": 260 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 2.513705253601074, + "learning_rate": 0.00013038821954484606, + "loss": 2.6625, + "step": 261 + }, + { + "epoch": 1.0522088353413654, + "grad_norm": 2.7780377864837646, + "learning_rate": 0.00013012048192771085, + "loss": 3.1793, + "step": 262 + }, + { + "epoch": 1.0562248995983936, + "grad_norm": 2.522724151611328, + "learning_rate": 0.00012985274431057565, + "loss": 3.1926, + "step": 263 + }, + { + "epoch": 1.0602409638554218, + "grad_norm": 3.2487499713897705, + "learning_rate": 0.0001295850066934404, + "loss": 3.9779, + "step": 264 + }, + { + "epoch": 1.0642570281124497, + "grad_norm": 2.4341378211975098, + "learning_rate": 0.00012931726907630523, + "loss": 2.9064, + "step": 265 + }, + { + "epoch": 1.0682730923694779, + "grad_norm": 2.5539276599884033, + "learning_rate": 0.00012904953145917002, + "loss": 3.4219, + "step": 266 + }, + { + "epoch": 1.072289156626506, + "grad_norm": 2.0425596237182617, + "learning_rate": 0.00012878179384203482, + "loss": 2.5395, + "step": 267 + }, + { + "epoch": 1.0763052208835342, + "grad_norm": 2.3625378608703613, + "learning_rate": 0.0001285140562248996, + "loss": 2.757, + "step": 268 + }, + { + "epoch": 1.0803212851405624, + "grad_norm": 2.0414483547210693, + "learning_rate": 0.0001282463186077644, + "loss": 2.7764, + "step": 269 + }, + { + "epoch": 1.0843373493975903, + "grad_norm": 3.544743061065674, + "learning_rate": 0.00012797858099062917, + "loss": 3.6176, + "step": 270 + }, + { + "epoch": 1.0883534136546185, + "grad_norm": 2.4814655780792236, + "learning_rate": 0.00012771084337349396, + "loss": 3.2284, + "step": 271 + }, + { + "epoch": 1.0923694779116466, + "grad_norm": 2.364025592803955, + "learning_rate": 0.00012744310575635878, + "loss": 3.6178, + "step": 272 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 1.989912748336792, + "learning_rate": 0.00012717536813922357, + "loss": 2.5839, + "step": 273 + }, + { + "epoch": 1.1004016064257027, + "grad_norm": 2.413421154022217, + "learning_rate": 0.00012690763052208837, + "loss": 3.5416, + "step": 274 + }, + { + "epoch": 1.104417670682731, + "grad_norm": 2.679314613342285, + "learning_rate": 0.00012663989290495316, + "loss": 3.0015, + "step": 275 + }, + { + "epoch": 1.108433734939759, + "grad_norm": 2.2354209423065186, + "learning_rate": 0.00012637215528781793, + "loss": 3.3867, + "step": 276 + }, + { + "epoch": 1.1124497991967872, + "grad_norm": 2.4003982543945312, + "learning_rate": 0.00012610441767068272, + "loss": 3.0927, + "step": 277 + }, + { + "epoch": 1.1164658634538154, + "grad_norm": 2.2922661304473877, + "learning_rate": 0.00012583668005354754, + "loss": 2.835, + "step": 278 + }, + { + "epoch": 1.1204819277108433, + "grad_norm": 2.1880528926849365, + "learning_rate": 0.00012556894243641233, + "loss": 2.9581, + "step": 279 + }, + { + "epoch": 1.1244979919678715, + "grad_norm": 2.5255534648895264, + "learning_rate": 0.00012530120481927712, + "loss": 2.7931, + "step": 280 + }, + { + "epoch": 1.1285140562248996, + "grad_norm": 2.2529118061065674, + "learning_rate": 0.00012503346720214192, + "loss": 2.6831, + "step": 281 + }, + { + "epoch": 1.1325301204819278, + "grad_norm": 2.2123444080352783, + "learning_rate": 0.0001247657295850067, + "loss": 2.8091, + "step": 282 + }, + { + "epoch": 1.1365461847389557, + "grad_norm": 2.538160800933838, + "learning_rate": 0.00012449799196787148, + "loss": 3.0089, + "step": 283 + }, + { + "epoch": 1.140562248995984, + "grad_norm": 3.0052592754364014, + "learning_rate": 0.00012423025435073627, + "loss": 3.9042, + "step": 284 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 2.691096067428589, + "learning_rate": 0.0001239625167336011, + "loss": 3.9491, + "step": 285 + }, + { + "epoch": 1.1485943775100402, + "grad_norm": 2.6101088523864746, + "learning_rate": 0.00012369477911646588, + "loss": 2.9432, + "step": 286 + }, + { + "epoch": 1.1526104417670684, + "grad_norm": 2.368319511413574, + "learning_rate": 0.00012342704149933067, + "loss": 2.966, + "step": 287 + }, + { + "epoch": 1.1566265060240963, + "grad_norm": 2.4615232944488525, + "learning_rate": 0.00012315930388219547, + "loss": 3.4359, + "step": 288 + }, + { + "epoch": 1.1606425702811245, + "grad_norm": 2.3296902179718018, + "learning_rate": 0.00012289156626506023, + "loss": 3.0168, + "step": 289 + }, + { + "epoch": 1.1646586345381527, + "grad_norm": 2.7844183444976807, + "learning_rate": 0.00012262382864792503, + "loss": 3.1574, + "step": 290 + }, + { + "epoch": 1.1686746987951806, + "grad_norm": 2.486553430557251, + "learning_rate": 0.00012235609103078982, + "loss": 3.1044, + "step": 291 + }, + { + "epoch": 1.1726907630522088, + "grad_norm": 2.4482836723327637, + "learning_rate": 0.00012208835341365464, + "loss": 3.2606, + "step": 292 + }, + { + "epoch": 1.176706827309237, + "grad_norm": 2.393049955368042, + "learning_rate": 0.00012182061579651942, + "loss": 2.9026, + "step": 293 + }, + { + "epoch": 1.180722891566265, + "grad_norm": 2.8396050930023193, + "learning_rate": 0.00012155287817938421, + "loss": 2.9787, + "step": 294 + }, + { + "epoch": 1.1847389558232932, + "grad_norm": 2.447458028793335, + "learning_rate": 0.000121285140562249, + "loss": 2.6885, + "step": 295 + }, + { + "epoch": 1.1887550200803212, + "grad_norm": 2.3094258308410645, + "learning_rate": 0.0001210174029451138, + "loss": 2.9401, + "step": 296 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 2.5315654277801514, + "learning_rate": 0.00012074966532797858, + "loss": 3.2829, + "step": 297 + }, + { + "epoch": 1.1967871485943775, + "grad_norm": 2.4781811237335205, + "learning_rate": 0.0001204819277108434, + "loss": 2.9542, + "step": 298 + }, + { + "epoch": 1.2008032128514057, + "grad_norm": 2.759524345397949, + "learning_rate": 0.00012021419009370817, + "loss": 3.5029, + "step": 299 + }, + { + "epoch": 1.2048192771084336, + "grad_norm": 2.388485908508301, + "learning_rate": 0.00011994645247657297, + "loss": 2.6706, + "step": 300 + }, + { + "epoch": 1.2088353413654618, + "grad_norm": 2.5414671897888184, + "learning_rate": 0.00011967871485943776, + "loss": 2.7898, + "step": 301 + }, + { + "epoch": 1.21285140562249, + "grad_norm": 3.36741042137146, + "learning_rate": 0.00011941097724230255, + "loss": 2.7475, + "step": 302 + }, + { + "epoch": 1.216867469879518, + "grad_norm": 2.7749950885772705, + "learning_rate": 0.00011914323962516733, + "loss": 2.9617, + "step": 303 + }, + { + "epoch": 1.2208835341365463, + "grad_norm": 2.685976505279541, + "learning_rate": 0.00011887550200803212, + "loss": 3.2493, + "step": 304 + }, + { + "epoch": 1.2248995983935742, + "grad_norm": 2.7357215881347656, + "learning_rate": 0.00011860776439089693, + "loss": 2.7249, + "step": 305 + }, + { + "epoch": 1.2289156626506024, + "grad_norm": 2.962019443511963, + "learning_rate": 0.00011834002677376172, + "loss": 3.4647, + "step": 306 + }, + { + "epoch": 1.2329317269076305, + "grad_norm": 2.891343832015991, + "learning_rate": 0.00011807228915662652, + "loss": 3.5527, + "step": 307 + }, + { + "epoch": 1.2369477911646587, + "grad_norm": 2.7382125854492188, + "learning_rate": 0.00011780455153949131, + "loss": 3.1955, + "step": 308 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 2.385486602783203, + "learning_rate": 0.00011753681392235609, + "loss": 3.022, + "step": 309 + }, + { + "epoch": 1.2449799196787148, + "grad_norm": 2.553295612335205, + "learning_rate": 0.00011726907630522088, + "loss": 2.801, + "step": 310 + }, + { + "epoch": 1.248995983935743, + "grad_norm": 2.9965014457702637, + "learning_rate": 0.00011700133868808567, + "loss": 2.4453, + "step": 311 + }, + { + "epoch": 1.2530120481927711, + "grad_norm": 2.327629566192627, + "learning_rate": 0.00011673360107095048, + "loss": 2.2897, + "step": 312 + }, + { + "epoch": 1.2570281124497993, + "grad_norm": 2.7544825077056885, + "learning_rate": 0.00011646586345381527, + "loss": 3.2796, + "step": 313 + }, + { + "epoch": 1.2610441767068274, + "grad_norm": 2.590733051300049, + "learning_rate": 0.00011619812583668007, + "loss": 2.9126, + "step": 314 + }, + { + "epoch": 1.2650602409638554, + "grad_norm": 3.3064663410186768, + "learning_rate": 0.00011593038821954485, + "loss": 3.6784, + "step": 315 + }, + { + "epoch": 1.2690763052208835, + "grad_norm": 3.3928616046905518, + "learning_rate": 0.00011566265060240964, + "loss": 3.3292, + "step": 316 + }, + { + "epoch": 1.2730923694779117, + "grad_norm": 2.6576473712921143, + "learning_rate": 0.00011539491298527443, + "loss": 3.0617, + "step": 317 + }, + { + "epoch": 1.2771084337349397, + "grad_norm": 2.5956337451934814, + "learning_rate": 0.00011512717536813924, + "loss": 2.9754, + "step": 318 + }, + { + "epoch": 1.2811244979919678, + "grad_norm": 2.8080995082855225, + "learning_rate": 0.00011485943775100403, + "loss": 3.1712, + "step": 319 + }, + { + "epoch": 1.285140562248996, + "grad_norm": 2.4304864406585693, + "learning_rate": 0.00011459170013386882, + "loss": 3.0387, + "step": 320 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 2.2777411937713623, + "learning_rate": 0.0001143239625167336, + "loss": 2.8357, + "step": 321 + }, + { + "epoch": 1.2931726907630523, + "grad_norm": 2.370192289352417, + "learning_rate": 0.0001140562248995984, + "loss": 2.5937, + "step": 322 + }, + { + "epoch": 1.2971887550200802, + "grad_norm": 3.0521585941314697, + "learning_rate": 0.00011378848728246319, + "loss": 4.4271, + "step": 323 + }, + { + "epoch": 1.3012048192771084, + "grad_norm": 2.4153242111206055, + "learning_rate": 0.00011352074966532798, + "loss": 2.7952, + "step": 324 + }, + { + "epoch": 1.3052208835341366, + "grad_norm": 2.629312038421631, + "learning_rate": 0.00011325301204819279, + "loss": 3.6324, + "step": 325 + }, + { + "epoch": 1.3092369477911647, + "grad_norm": 2.0146517753601074, + "learning_rate": 0.00011298527443105758, + "loss": 2.3154, + "step": 326 + }, + { + "epoch": 1.3132530120481927, + "grad_norm": 2.3414394855499268, + "learning_rate": 0.00011271753681392236, + "loss": 2.809, + "step": 327 + }, + { + "epoch": 1.3172690763052208, + "grad_norm": 2.366577386856079, + "learning_rate": 0.00011244979919678715, + "loss": 3.7852, + "step": 328 + }, + { + "epoch": 1.321285140562249, + "grad_norm": 2.661543130874634, + "learning_rate": 0.00011218206157965195, + "loss": 2.818, + "step": 329 + }, + { + "epoch": 1.3253012048192772, + "grad_norm": 2.51835036277771, + "learning_rate": 0.00011191432396251674, + "loss": 2.8359, + "step": 330 + }, + { + "epoch": 1.3293172690763053, + "grad_norm": 2.473179817199707, + "learning_rate": 0.00011164658634538152, + "loss": 2.8498, + "step": 331 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.9637928009033203, + "learning_rate": 0.00011137884872824634, + "loss": 4.164, + "step": 332 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 2.5028486251831055, + "learning_rate": 0.00011111111111111112, + "loss": 3.6701, + "step": 333 + }, + { + "epoch": 1.3413654618473896, + "grad_norm": 3.149928092956543, + "learning_rate": 0.00011084337349397591, + "loss": 3.7949, + "step": 334 + }, + { + "epoch": 1.3453815261044177, + "grad_norm": 2.7405877113342285, + "learning_rate": 0.0001105756358768407, + "loss": 3.2064, + "step": 335 + }, + { + "epoch": 1.3493975903614457, + "grad_norm": 2.830744743347168, + "learning_rate": 0.0001103078982597055, + "loss": 2.8919, + "step": 336 + }, + { + "epoch": 1.3534136546184738, + "grad_norm": 2.9335427284240723, + "learning_rate": 0.00011004016064257027, + "loss": 3.1013, + "step": 337 + }, + { + "epoch": 1.357429718875502, + "grad_norm": 2.505171537399292, + "learning_rate": 0.0001097724230254351, + "loss": 3.206, + "step": 338 + }, + { + "epoch": 1.3614457831325302, + "grad_norm": 3.127634286880493, + "learning_rate": 0.00010950468540829987, + "loss": 3.2454, + "step": 339 + }, + { + "epoch": 1.3654618473895583, + "grad_norm": 2.7009451389312744, + "learning_rate": 0.00010923694779116467, + "loss": 3.0679, + "step": 340 + }, + { + "epoch": 1.3694779116465863, + "grad_norm": 2.3906707763671875, + "learning_rate": 0.00010896921017402946, + "loss": 3.7267, + "step": 341 + }, + { + "epoch": 1.3734939759036144, + "grad_norm": 2.4884233474731445, + "learning_rate": 0.00010870147255689425, + "loss": 3.2707, + "step": 342 + }, + { + "epoch": 1.3775100401606426, + "grad_norm": 2.514148712158203, + "learning_rate": 0.00010843373493975903, + "loss": 3.0734, + "step": 343 + }, + { + "epoch": 1.3815261044176708, + "grad_norm": 2.450438976287842, + "learning_rate": 0.00010816599732262382, + "loss": 2.7529, + "step": 344 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 2.5931103229522705, + "learning_rate": 0.00010789825970548863, + "loss": 3.8578, + "step": 345 + }, + { + "epoch": 1.3895582329317269, + "grad_norm": 2.386543035507202, + "learning_rate": 0.00010763052208835342, + "loss": 3.2145, + "step": 346 + }, + { + "epoch": 1.393574297188755, + "grad_norm": 2.643378973007202, + "learning_rate": 0.00010736278447121822, + "loss": 2.7853, + "step": 347 + }, + { + "epoch": 1.3975903614457832, + "grad_norm": 1.9885903596878052, + "learning_rate": 0.00010709504685408301, + "loss": 2.2022, + "step": 348 + }, + { + "epoch": 1.4016064257028114, + "grad_norm": 2.6465091705322266, + "learning_rate": 0.00010682730923694779, + "loss": 3.5565, + "step": 349 + }, + { + "epoch": 1.4056224899598393, + "grad_norm": 2.6052937507629395, + "learning_rate": 0.00010655957161981258, + "loss": 2.9741, + "step": 350 + }, + { + "epoch": 1.4096385542168675, + "grad_norm": 2.7112314701080322, + "learning_rate": 0.00010629183400267737, + "loss": 4.0259, + "step": 351 + }, + { + "epoch": 1.4136546184738956, + "grad_norm": 2.5356833934783936, + "learning_rate": 0.00010602409638554218, + "loss": 2.6879, + "step": 352 + }, + { + "epoch": 1.4176706827309236, + "grad_norm": 2.745176315307617, + "learning_rate": 0.00010575635876840697, + "loss": 4.0105, + "step": 353 + }, + { + "epoch": 1.4216867469879517, + "grad_norm": 2.5344765186309814, + "learning_rate": 0.00010548862115127177, + "loss": 2.9797, + "step": 354 + }, + { + "epoch": 1.4257028112449799, + "grad_norm": 2.680912733078003, + "learning_rate": 0.00010522088353413654, + "loss": 3.3971, + "step": 355 + }, + { + "epoch": 1.429718875502008, + "grad_norm": 3.498023271560669, + "learning_rate": 0.00010495314591700134, + "loss": 3.6706, + "step": 356 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 2.4419398307800293, + "learning_rate": 0.00010468540829986613, + "loss": 2.6477, + "step": 357 + }, + { + "epoch": 1.4377510040160644, + "grad_norm": 3.2264997959136963, + "learning_rate": 0.00010441767068273094, + "loss": 4.5181, + "step": 358 + }, + { + "epoch": 1.4417670682730923, + "grad_norm": 2.5578315258026123, + "learning_rate": 0.00010414993306559573, + "loss": 2.6282, + "step": 359 + }, + { + "epoch": 1.4457831325301205, + "grad_norm": 2.539045572280884, + "learning_rate": 0.00010388219544846052, + "loss": 2.6435, + "step": 360 + }, + { + "epoch": 1.4497991967871486, + "grad_norm": 2.9697344303131104, + "learning_rate": 0.0001036144578313253, + "loss": 2.6676, + "step": 361 + }, + { + "epoch": 1.4538152610441766, + "grad_norm": 2.606131076812744, + "learning_rate": 0.0001033467202141901, + "loss": 2.9316, + "step": 362 + }, + { + "epoch": 1.4578313253012047, + "grad_norm": 3.290837049484253, + "learning_rate": 0.00010307898259705489, + "loss": 3.0869, + "step": 363 + }, + { + "epoch": 1.461847389558233, + "grad_norm": 2.331320285797119, + "learning_rate": 0.00010281124497991968, + "loss": 2.555, + "step": 364 + }, + { + "epoch": 1.465863453815261, + "grad_norm": 2.8447391986846924, + "learning_rate": 0.00010254350736278449, + "loss": 2.6998, + "step": 365 + }, + { + "epoch": 1.4698795180722892, + "grad_norm": 2.6170618534088135, + "learning_rate": 0.00010227576974564928, + "loss": 2.7688, + "step": 366 + }, + { + "epoch": 1.4738955823293174, + "grad_norm": 2.933560609817505, + "learning_rate": 0.00010200803212851406, + "loss": 3.0291, + "step": 367 + }, + { + "epoch": 1.4779116465863453, + "grad_norm": 2.6285972595214844, + "learning_rate": 0.00010174029451137885, + "loss": 2.8629, + "step": 368 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 3.2716546058654785, + "learning_rate": 0.00010147255689424364, + "loss": 3.1994, + "step": 369 + }, + { + "epoch": 1.4859437751004017, + "grad_norm": 2.758296489715576, + "learning_rate": 0.00010120481927710844, + "loss": 2.6734, + "step": 370 + }, + { + "epoch": 1.4899598393574296, + "grad_norm": 2.3439807891845703, + "learning_rate": 0.00010093708165997322, + "loss": 2.8747, + "step": 371 + }, + { + "epoch": 1.4939759036144578, + "grad_norm": 2.4199349880218506, + "learning_rate": 0.00010066934404283804, + "loss": 2.7135, + "step": 372 + }, + { + "epoch": 1.497991967871486, + "grad_norm": 2.8863987922668457, + "learning_rate": 0.00010040160642570282, + "loss": 3.3239, + "step": 373 + }, + { + "epoch": 1.502008032128514, + "grad_norm": 2.5620765686035156, + "learning_rate": 0.00010013386880856761, + "loss": 2.5748, + "step": 374 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 2.5705456733703613, + "learning_rate": 9.98661311914324e-05, + "loss": 3.4645, + "step": 375 + }, + { + "epoch": 1.5100401606425704, + "grad_norm": 2.75276780128479, + "learning_rate": 9.95983935742972e-05, + "loss": 2.7345, + "step": 376 + }, + { + "epoch": 1.5140562248995983, + "grad_norm": 2.5206143856048584, + "learning_rate": 9.933065595716199e-05, + "loss": 2.8325, + "step": 377 + }, + { + "epoch": 1.5180722891566265, + "grad_norm": 2.3054890632629395, + "learning_rate": 9.906291834002678e-05, + "loss": 2.884, + "step": 378 + }, + { + "epoch": 1.5220883534136547, + "grad_norm": 2.563084125518799, + "learning_rate": 9.879518072289157e-05, + "loss": 3.0262, + "step": 379 + }, + { + "epoch": 1.5261044176706826, + "grad_norm": 2.575040817260742, + "learning_rate": 9.852744310575637e-05, + "loss": 3.096, + "step": 380 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 2.3715319633483887, + "learning_rate": 9.825970548862116e-05, + "loss": 2.7163, + "step": 381 + }, + { + "epoch": 1.534136546184739, + "grad_norm": 2.7323389053344727, + "learning_rate": 9.799196787148595e-05, + "loss": 2.792, + "step": 382 + }, + { + "epoch": 1.538152610441767, + "grad_norm": 2.523524522781372, + "learning_rate": 9.772423025435074e-05, + "loss": 3.2821, + "step": 383 + }, + { + "epoch": 1.5421686746987953, + "grad_norm": 2.533090114593506, + "learning_rate": 9.745649263721554e-05, + "loss": 2.7672, + "step": 384 + }, + { + "epoch": 1.5461847389558234, + "grad_norm": 2.644031286239624, + "learning_rate": 9.718875502008033e-05, + "loss": 3.0318, + "step": 385 + }, + { + "epoch": 1.5502008032128514, + "grad_norm": 3.1442739963531494, + "learning_rate": 9.692101740294511e-05, + "loss": 3.6628, + "step": 386 + }, + { + "epoch": 1.5542168674698795, + "grad_norm": 2.403552532196045, + "learning_rate": 9.665327978580992e-05, + "loss": 2.4332, + "step": 387 + }, + { + "epoch": 1.5582329317269075, + "grad_norm": 2.478534698486328, + "learning_rate": 9.638554216867471e-05, + "loss": 2.4746, + "step": 388 + }, + { + "epoch": 1.5622489959839356, + "grad_norm": 2.7873339653015137, + "learning_rate": 9.611780455153949e-05, + "loss": 2.8514, + "step": 389 + }, + { + "epoch": 1.5662650602409638, + "grad_norm": 2.751532793045044, + "learning_rate": 9.58500669344043e-05, + "loss": 2.9365, + "step": 390 + }, + { + "epoch": 1.570281124497992, + "grad_norm": 2.8862998485565186, + "learning_rate": 9.558232931726909e-05, + "loss": 3.2632, + "step": 391 + }, + { + "epoch": 1.5742971887550201, + "grad_norm": 2.5372817516326904, + "learning_rate": 9.531459170013387e-05, + "loss": 2.8649, + "step": 392 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 2.428025007247925, + "learning_rate": 9.504685408299867e-05, + "loss": 2.6417, + "step": 393 + }, + { + "epoch": 1.5823293172690764, + "grad_norm": 3.284771680831909, + "learning_rate": 9.477911646586346e-05, + "loss": 3.4804, + "step": 394 + }, + { + "epoch": 1.5863453815261044, + "grad_norm": 2.8651950359344482, + "learning_rate": 9.451137884872824e-05, + "loss": 3.1454, + "step": 395 + }, + { + "epoch": 1.5903614457831325, + "grad_norm": 3.078660011291504, + "learning_rate": 9.424364123159304e-05, + "loss": 3.5961, + "step": 396 + }, + { + "epoch": 1.5943775100401605, + "grad_norm": 2.2207376956939697, + "learning_rate": 9.397590361445784e-05, + "loss": 2.3121, + "step": 397 + }, + { + "epoch": 1.5983935742971886, + "grad_norm": 2.4094178676605225, + "learning_rate": 9.370816599732262e-05, + "loss": 2.7138, + "step": 398 + }, + { + "epoch": 1.6024096385542168, + "grad_norm": 2.759876251220703, + "learning_rate": 9.344042838018742e-05, + "loss": 3.5605, + "step": 399 + }, + { + "epoch": 1.606425702811245, + "grad_norm": 2.189237117767334, + "learning_rate": 9.317269076305222e-05, + "loss": 2.6023, + "step": 400 + }, + { + "epoch": 1.6104417670682731, + "grad_norm": 2.585479736328125, + "learning_rate": 9.2904953145917e-05, + "loss": 3.2234, + "step": 401 + }, + { + "epoch": 1.6144578313253013, + "grad_norm": 2.565342664718628, + "learning_rate": 9.26372155287818e-05, + "loss": 3.0341, + "step": 402 + }, + { + "epoch": 1.6184738955823295, + "grad_norm": 2.4045302867889404, + "learning_rate": 9.23694779116466e-05, + "loss": 2.7032, + "step": 403 + }, + { + "epoch": 1.6224899598393574, + "grad_norm": 3.0136139392852783, + "learning_rate": 9.210174029451138e-05, + "loss": 3.1651, + "step": 404 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 2.253669261932373, + "learning_rate": 9.183400267737617e-05, + "loss": 2.2507, + "step": 405 + }, + { + "epoch": 1.6305220883534135, + "grad_norm": 2.734966993331909, + "learning_rate": 9.156626506024096e-05, + "loss": 3.0798, + "step": 406 + }, + { + "epoch": 1.6345381526104417, + "grad_norm": 2.955502986907959, + "learning_rate": 9.129852744310576e-05, + "loss": 3.086, + "step": 407 + }, + { + "epoch": 1.6385542168674698, + "grad_norm": 3.2345542907714844, + "learning_rate": 9.103078982597055e-05, + "loss": 3.3553, + "step": 408 + }, + { + "epoch": 1.642570281124498, + "grad_norm": 2.7762720584869385, + "learning_rate": 9.076305220883534e-05, + "loss": 3.4238, + "step": 409 + }, + { + "epoch": 1.6465863453815262, + "grad_norm": 2.824641466140747, + "learning_rate": 9.049531459170014e-05, + "loss": 2.8925, + "step": 410 + }, + { + "epoch": 1.6506024096385543, + "grad_norm": 2.754810094833374, + "learning_rate": 9.022757697456493e-05, + "loss": 2.9022, + "step": 411 + }, + { + "epoch": 1.6546184738955825, + "grad_norm": 2.5305283069610596, + "learning_rate": 8.995983935742972e-05, + "loss": 2.927, + "step": 412 + }, + { + "epoch": 1.6586345381526104, + "grad_norm": 2.796165943145752, + "learning_rate": 8.969210174029451e-05, + "loss": 2.9185, + "step": 413 + }, + { + "epoch": 1.6626506024096386, + "grad_norm": 2.9504239559173584, + "learning_rate": 8.942436412315931e-05, + "loss": 3.3915, + "step": 414 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.8904786109924316, + "learning_rate": 8.91566265060241e-05, + "loss": 2.8841, + "step": 415 + }, + { + "epoch": 1.6706827309236947, + "grad_norm": 2.184354305267334, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3859, + "step": 416 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 3.1552340984344482, + "learning_rate": 8.862115127175369e-05, + "loss": 3.22, + "step": 417 + }, + { + "epoch": 1.678714859437751, + "grad_norm": 3.2323250770568848, + "learning_rate": 8.835341365461848e-05, + "loss": 2.8859, + "step": 418 + }, + { + "epoch": 1.6827309236947792, + "grad_norm": 2.726513147354126, + "learning_rate": 8.808567603748327e-05, + "loss": 3.0969, + "step": 419 + }, + { + "epoch": 1.6867469879518073, + "grad_norm": 2.7404675483703613, + "learning_rate": 8.781793842034806e-05, + "loss": 2.76, + "step": 420 + }, + { + "epoch": 1.6907630522088355, + "grad_norm": 3.433872699737549, + "learning_rate": 8.755020080321286e-05, + "loss": 3.1852, + "step": 421 + }, + { + "epoch": 1.6947791164658634, + "grad_norm": 3.4727306365966797, + "learning_rate": 8.728246318607765e-05, + "loss": 3.6413, + "step": 422 + }, + { + "epoch": 1.6987951807228916, + "grad_norm": 2.968161106109619, + "learning_rate": 8.701472556894244e-05, + "loss": 3.15, + "step": 423 + }, + { + "epoch": 1.7028112449799195, + "grad_norm": 2.8164682388305664, + "learning_rate": 8.674698795180724e-05, + "loss": 3.0286, + "step": 424 + }, + { + "epoch": 1.7068273092369477, + "grad_norm": 2.7942745685577393, + "learning_rate": 8.647925033467203e-05, + "loss": 3.2501, + "step": 425 + }, + { + "epoch": 1.7108433734939759, + "grad_norm": 3.2419016361236572, + "learning_rate": 8.621151271753681e-05, + "loss": 4.3181, + "step": 426 + }, + { + "epoch": 1.714859437751004, + "grad_norm": 3.3823928833007812, + "learning_rate": 8.594377510040161e-05, + "loss": 3.2917, + "step": 427 + }, + { + "epoch": 1.7188755020080322, + "grad_norm": 2.8482446670532227, + "learning_rate": 8.567603748326641e-05, + "loss": 3.0338, + "step": 428 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 2.435845375061035, + "learning_rate": 8.540829986613119e-05, + "loss": 2.5519, + "step": 429 + }, + { + "epoch": 1.7269076305220885, + "grad_norm": 2.9163546562194824, + "learning_rate": 8.514056224899599e-05, + "loss": 3.72, + "step": 430 + }, + { + "epoch": 1.7309236947791165, + "grad_norm": 2.3660037517547607, + "learning_rate": 8.487282463186079e-05, + "loss": 2.3941, + "step": 431 + }, + { + "epoch": 1.7349397590361446, + "grad_norm": 2.527449131011963, + "learning_rate": 8.460508701472556e-05, + "loss": 2.9851, + "step": 432 + }, + { + "epoch": 1.7389558232931726, + "grad_norm": 2.2324576377868652, + "learning_rate": 8.433734939759037e-05, + "loss": 2.6241, + "step": 433 + }, + { + "epoch": 1.7429718875502007, + "grad_norm": 2.7165253162384033, + "learning_rate": 8.406961178045516e-05, + "loss": 2.7749, + "step": 434 + }, + { + "epoch": 1.7469879518072289, + "grad_norm": 2.7401411533355713, + "learning_rate": 8.380187416331994e-05, + "loss": 2.9022, + "step": 435 + }, + { + "epoch": 1.751004016064257, + "grad_norm": 2.518826961517334, + "learning_rate": 8.353413654618474e-05, + "loss": 2.7587, + "step": 436 + }, + { + "epoch": 1.7550200803212852, + "grad_norm": 2.493936061859131, + "learning_rate": 8.326639892904954e-05, + "loss": 3.1417, + "step": 437 + }, + { + "epoch": 1.7590361445783134, + "grad_norm": 2.747951030731201, + "learning_rate": 8.299866131191432e-05, + "loss": 2.6913, + "step": 438 + }, + { + "epoch": 1.7630522088353415, + "grad_norm": 2.8907039165496826, + "learning_rate": 8.273092369477911e-05, + "loss": 2.4416, + "step": 439 + }, + { + "epoch": 1.7670682730923695, + "grad_norm": 3.6564669609069824, + "learning_rate": 8.246318607764392e-05, + "loss": 3.9361, + "step": 440 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 2.4362285137176514, + "learning_rate": 8.21954484605087e-05, + "loss": 2.461, + "step": 441 + }, + { + "epoch": 1.7751004016064256, + "grad_norm": 3.2182202339172363, + "learning_rate": 8.192771084337349e-05, + "loss": 3.2511, + "step": 442 + }, + { + "epoch": 1.7791164658634537, + "grad_norm": 3.2106211185455322, + "learning_rate": 8.16599732262383e-05, + "loss": 4.4307, + "step": 443 + }, + { + "epoch": 1.783132530120482, + "grad_norm": 3.4369003772735596, + "learning_rate": 8.139223560910308e-05, + "loss": 4.08, + "step": 444 + }, + { + "epoch": 1.78714859437751, + "grad_norm": 2.2681970596313477, + "learning_rate": 8.112449799196787e-05, + "loss": 2.3631, + "step": 445 + }, + { + "epoch": 1.7911646586345382, + "grad_norm": 2.691133975982666, + "learning_rate": 8.085676037483266e-05, + "loss": 2.6157, + "step": 446 + }, + { + "epoch": 1.7951807228915664, + "grad_norm": 2.9200479984283447, + "learning_rate": 8.058902275769746e-05, + "loss": 2.6649, + "step": 447 + }, + { + "epoch": 1.7991967871485943, + "grad_norm": 2.787264108657837, + "learning_rate": 8.032128514056225e-05, + "loss": 2.763, + "step": 448 + }, + { + "epoch": 1.8032128514056225, + "grad_norm": 2.940075635910034, + "learning_rate": 8.005354752342704e-05, + "loss": 2.9436, + "step": 449 + }, + { + "epoch": 1.8072289156626506, + "grad_norm": 3.1111507415771484, + "learning_rate": 7.978580990629184e-05, + "loss": 3.1194, + "step": 450 + }, + { + "epoch": 1.8112449799196786, + "grad_norm": 2.695709228515625, + "learning_rate": 7.951807228915663e-05, + "loss": 2.7517, + "step": 451 + }, + { + "epoch": 1.8152610441767068, + "grad_norm": 2.939112663269043, + "learning_rate": 7.925033467202142e-05, + "loss": 3.7794, + "step": 452 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 2.583163022994995, + "learning_rate": 7.898259705488621e-05, + "loss": 3.0265, + "step": 453 + }, + { + "epoch": 1.823293172690763, + "grad_norm": 2.496131181716919, + "learning_rate": 7.8714859437751e-05, + "loss": 2.5762, + "step": 454 + }, + { + "epoch": 1.8273092369477912, + "grad_norm": 2.4272570610046387, + "learning_rate": 7.84471218206158e-05, + "loss": 2.758, + "step": 455 + }, + { + "epoch": 1.8313253012048194, + "grad_norm": 2.4154021739959717, + "learning_rate": 7.817938420348059e-05, + "loss": 2.7325, + "step": 456 + }, + { + "epoch": 1.8353413654618473, + "grad_norm": 2.5219106674194336, + "learning_rate": 7.791164658634539e-05, + "loss": 2.779, + "step": 457 + }, + { + "epoch": 1.8393574297188755, + "grad_norm": 2.3390161991119385, + "learning_rate": 7.764390896921018e-05, + "loss": 2.2922, + "step": 458 + }, + { + "epoch": 1.8433734939759037, + "grad_norm": 2.7101354598999023, + "learning_rate": 7.737617135207497e-05, + "loss": 2.9825, + "step": 459 + }, + { + "epoch": 1.8473895582329316, + "grad_norm": 2.8510243892669678, + "learning_rate": 7.710843373493976e-05, + "loss": 2.8628, + "step": 460 + }, + { + "epoch": 1.8514056224899598, + "grad_norm": 2.6924989223480225, + "learning_rate": 7.684069611780456e-05, + "loss": 2.6543, + "step": 461 + }, + { + "epoch": 1.855421686746988, + "grad_norm": 2.6552584171295166, + "learning_rate": 7.657295850066935e-05, + "loss": 3.0625, + "step": 462 + }, + { + "epoch": 1.859437751004016, + "grad_norm": 3.2962827682495117, + "learning_rate": 7.630522088353414e-05, + "loss": 3.308, + "step": 463 + }, + { + "epoch": 1.8634538152610443, + "grad_norm": 3.0845699310302734, + "learning_rate": 7.603748326639893e-05, + "loss": 3.5178, + "step": 464 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 2.768254518508911, + "learning_rate": 7.576974564926373e-05, + "loss": 3.6667, + "step": 465 + }, + { + "epoch": 1.8714859437751004, + "grad_norm": 2.5801167488098145, + "learning_rate": 7.550200803212851e-05, + "loss": 2.7686, + "step": 466 + }, + { + "epoch": 1.8755020080321285, + "grad_norm": 2.2853081226348877, + "learning_rate": 7.523427041499331e-05, + "loss": 2.2115, + "step": 467 + }, + { + "epoch": 1.8795180722891565, + "grad_norm": 2.9309747219085693, + "learning_rate": 7.49665327978581e-05, + "loss": 2.9426, + "step": 468 + }, + { + "epoch": 1.8835341365461846, + "grad_norm": 3.146700143814087, + "learning_rate": 7.469879518072289e-05, + "loss": 3.3903, + "step": 469 + }, + { + "epoch": 1.8875502008032128, + "grad_norm": 3.3652424812316895, + "learning_rate": 7.443105756358769e-05, + "loss": 3.0085, + "step": 470 + }, + { + "epoch": 1.891566265060241, + "grad_norm": 2.424377918243408, + "learning_rate": 7.416331994645248e-05, + "loss": 2.5145, + "step": 471 + }, + { + "epoch": 1.895582329317269, + "grad_norm": 2.5642752647399902, + "learning_rate": 7.389558232931726e-05, + "loss": 3.1927, + "step": 472 + }, + { + "epoch": 1.8995983935742973, + "grad_norm": 2.7574706077575684, + "learning_rate": 7.362784471218207e-05, + "loss": 2.6753, + "step": 473 + }, + { + "epoch": 1.9036144578313254, + "grad_norm": 2.6844048500061035, + "learning_rate": 7.336010709504686e-05, + "loss": 2.7126, + "step": 474 + }, + { + "epoch": 1.9076305220883534, + "grad_norm": 2.3251895904541016, + "learning_rate": 7.309236947791164e-05, + "loss": 2.5947, + "step": 475 + }, + { + "epoch": 1.9116465863453815, + "grad_norm": 2.1562206745147705, + "learning_rate": 7.282463186077644e-05, + "loss": 2.2137, + "step": 476 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 2.400747776031494, + "learning_rate": 7.255689424364124e-05, + "loss": 2.8869, + "step": 477 + }, + { + "epoch": 1.9196787148594376, + "grad_norm": 3.1380369663238525, + "learning_rate": 7.228915662650602e-05, + "loss": 3.4202, + "step": 478 + }, + { + "epoch": 1.9236947791164658, + "grad_norm": 2.9858291149139404, + "learning_rate": 7.202141900937081e-05, + "loss": 3.1519, + "step": 479 + }, + { + "epoch": 1.927710843373494, + "grad_norm": 2.6354973316192627, + "learning_rate": 7.175368139223562e-05, + "loss": 2.8662, + "step": 480 + }, + { + "epoch": 1.9317269076305221, + "grad_norm": 2.7349445819854736, + "learning_rate": 7.14859437751004e-05, + "loss": 4.2679, + "step": 481 + }, + { + "epoch": 1.9357429718875503, + "grad_norm": 3.0139505863189697, + "learning_rate": 7.121820615796519e-05, + "loss": 2.9382, + "step": 482 + }, + { + "epoch": 1.9397590361445785, + "grad_norm": 3.1879093647003174, + "learning_rate": 7.095046854083e-05, + "loss": 3.168, + "step": 483 + }, + { + "epoch": 1.9437751004016064, + "grad_norm": 3.2778398990631104, + "learning_rate": 7.068273092369478e-05, + "loss": 3.4373, + "step": 484 + }, + { + "epoch": 1.9477911646586346, + "grad_norm": 3.024111747741699, + "learning_rate": 7.041499330655957e-05, + "loss": 3.7807, + "step": 485 + }, + { + "epoch": 1.9518072289156625, + "grad_norm": 2.750593423843384, + "learning_rate": 7.014725568942436e-05, + "loss": 3.4546, + "step": 486 + }, + { + "epoch": 1.9558232931726907, + "grad_norm": 2.9757187366485596, + "learning_rate": 6.987951807228917e-05, + "loss": 3.0145, + "step": 487 + }, + { + "epoch": 1.9598393574297188, + "grad_norm": 2.867292881011963, + "learning_rate": 6.961178045515395e-05, + "loss": 2.5524, + "step": 488 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 2.563595771789551, + "learning_rate": 6.934404283801874e-05, + "loss": 2.7503, + "step": 489 + }, + { + "epoch": 1.9678714859437751, + "grad_norm": 2.52006459236145, + "learning_rate": 6.907630522088355e-05, + "loss": 3.0431, + "step": 490 + }, + { + "epoch": 1.9718875502008033, + "grad_norm": 3.0700199604034424, + "learning_rate": 6.880856760374833e-05, + "loss": 3.7242, + "step": 491 + }, + { + "epoch": 1.9759036144578315, + "grad_norm": 2.7504234313964844, + "learning_rate": 6.854082998661312e-05, + "loss": 2.6293, + "step": 492 + }, + { + "epoch": 1.9799196787148594, + "grad_norm": 2.919828414916992, + "learning_rate": 6.827309236947793e-05, + "loss": 2.6278, + "step": 493 + }, + { + "epoch": 1.9839357429718876, + "grad_norm": 2.453157663345337, + "learning_rate": 6.80053547523427e-05, + "loss": 2.2764, + "step": 494 + }, + { + "epoch": 1.9879518072289155, + "grad_norm": 2.635430335998535, + "learning_rate": 6.77376171352075e-05, + "loss": 2.9467, + "step": 495 + }, + { + "epoch": 1.9919678714859437, + "grad_norm": 2.7158102989196777, + "learning_rate": 6.746987951807229e-05, + "loss": 2.7886, + "step": 496 + }, + { + "epoch": 1.9959839357429718, + "grad_norm": 2.3272292613983154, + "learning_rate": 6.720214190093708e-05, + "loss": 2.6445, + "step": 497 + }, + { + "epoch": 2.0, + "grad_norm": 2.2954020500183105, + "learning_rate": 6.693440428380188e-05, + "loss": 2.5719, + "step": 498 + }, + { + "epoch": 2.0, + "eval_loss": 0.8565791249275208, + "eval_runtime": 200.8505, + "eval_samples_per_second": 2.484, + "eval_steps_per_second": 1.245, + "step": 498 + }, + { + "epoch": 2.004016064257028, + "grad_norm": 2.3647961616516113, + "learning_rate": 6.666666666666667e-05, + "loss": 2.5357, + "step": 499 + }, + { + "epoch": 2.0080321285140563, + "grad_norm": 2.052393674850464, + "learning_rate": 6.639892904953146e-05, + "loss": 2.1653, + "step": 500 + }, + { + "epoch": 2.0120481927710845, + "grad_norm": 2.6393344402313232, + "learning_rate": 6.613119143239626e-05, + "loss": 2.2634, + "step": 501 + }, + { + "epoch": 2.0160642570281126, + "grad_norm": 2.4461183547973633, + "learning_rate": 6.586345381526105e-05, + "loss": 2.7017, + "step": 502 + }, + { + "epoch": 2.0200803212851404, + "grad_norm": 3.1604115962982178, + "learning_rate": 6.559571619812584e-05, + "loss": 3.6735, + "step": 503 + }, + { + "epoch": 2.0240963855421685, + "grad_norm": 3.0627472400665283, + "learning_rate": 6.532797858099063e-05, + "loss": 2.9889, + "step": 504 + }, + { + "epoch": 2.0281124497991967, + "grad_norm": 2.568150520324707, + "learning_rate": 6.506024096385543e-05, + "loss": 2.492, + "step": 505 + }, + { + "epoch": 2.032128514056225, + "grad_norm": 2.2594618797302246, + "learning_rate": 6.47925033467202e-05, + "loss": 1.8152, + "step": 506 + }, + { + "epoch": 2.036144578313253, + "grad_norm": 2.544188976287842, + "learning_rate": 6.452476572958501e-05, + "loss": 3.7016, + "step": 507 + }, + { + "epoch": 2.040160642570281, + "grad_norm": 2.418565511703491, + "learning_rate": 6.42570281124498e-05, + "loss": 2.3062, + "step": 508 + }, + { + "epoch": 2.0441767068273093, + "grad_norm": 2.3617923259735107, + "learning_rate": 6.398929049531458e-05, + "loss": 2.2887, + "step": 509 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 2.4115524291992188, + "learning_rate": 6.372155287817939e-05, + "loss": 2.4596, + "step": 510 + }, + { + "epoch": 2.0522088353413657, + "grad_norm": 2.763218402862549, + "learning_rate": 6.345381526104418e-05, + "loss": 2.7423, + "step": 511 + }, + { + "epoch": 2.0562248995983934, + "grad_norm": 2.515378713607788, + "learning_rate": 6.318607764390896e-05, + "loss": 2.4356, + "step": 512 + }, + { + "epoch": 2.0602409638554215, + "grad_norm": 2.809786796569824, + "learning_rate": 6.291834002677377e-05, + "loss": 3.3361, + "step": 513 + }, + { + "epoch": 2.0642570281124497, + "grad_norm": 2.3717005252838135, + "learning_rate": 6.265060240963856e-05, + "loss": 3.0205, + "step": 514 + }, + { + "epoch": 2.068273092369478, + "grad_norm": 2.7689290046691895, + "learning_rate": 6.238286479250335e-05, + "loss": 2.9104, + "step": 515 + }, + { + "epoch": 2.072289156626506, + "grad_norm": 2.573058843612671, + "learning_rate": 6.211512717536813e-05, + "loss": 2.2966, + "step": 516 + }, + { + "epoch": 2.076305220883534, + "grad_norm": 2.5662682056427, + "learning_rate": 6.184738955823294e-05, + "loss": 2.4407, + "step": 517 + }, + { + "epoch": 2.0803212851405624, + "grad_norm": 2.475853681564331, + "learning_rate": 6.157965194109773e-05, + "loss": 2.2512, + "step": 518 + }, + { + "epoch": 2.0843373493975905, + "grad_norm": 2.426939010620117, + "learning_rate": 6.131191432396251e-05, + "loss": 2.2575, + "step": 519 + }, + { + "epoch": 2.0883534136546187, + "grad_norm": 2.709951877593994, + "learning_rate": 6.104417670682732e-05, + "loss": 2.2289, + "step": 520 + }, + { + "epoch": 2.0923694779116464, + "grad_norm": 2.620199680328369, + "learning_rate": 6.0776439089692105e-05, + "loss": 2.6856, + "step": 521 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 2.236469030380249, + "learning_rate": 6.05087014725569e-05, + "loss": 2.1652, + "step": 522 + }, + { + "epoch": 2.1004016064257027, + "grad_norm": 2.4781830310821533, + "learning_rate": 6.02409638554217e-05, + "loss": 2.0519, + "step": 523 + }, + { + "epoch": 2.104417670682731, + "grad_norm": 2.9179675579071045, + "learning_rate": 5.9973226238286484e-05, + "loss": 2.3534, + "step": 524 + }, + { + "epoch": 2.108433734939759, + "grad_norm": 2.7088980674743652, + "learning_rate": 5.9705488621151276e-05, + "loss": 2.3717, + "step": 525 + }, + { + "epoch": 2.112449799196787, + "grad_norm": 2.784228801727295, + "learning_rate": 5.943775100401606e-05, + "loss": 2.7936, + "step": 526 + }, + { + "epoch": 2.1164658634538154, + "grad_norm": 3.1045587062835693, + "learning_rate": 5.917001338688086e-05, + "loss": 2.1785, + "step": 527 + }, + { + "epoch": 2.1204819277108435, + "grad_norm": 2.7609670162200928, + "learning_rate": 5.8902275769745655e-05, + "loss": 2.4232, + "step": 528 + }, + { + "epoch": 2.1244979919678713, + "grad_norm": 2.9791460037231445, + "learning_rate": 5.863453815261044e-05, + "loss": 2.6127, + "step": 529 + }, + { + "epoch": 2.1285140562248994, + "grad_norm": 2.917396306991577, + "learning_rate": 5.836680053547524e-05, + "loss": 2.5008, + "step": 530 + }, + { + "epoch": 2.1325301204819276, + "grad_norm": 3.066033124923706, + "learning_rate": 5.809906291834003e-05, + "loss": 2.8997, + "step": 531 + }, + { + "epoch": 2.1365461847389557, + "grad_norm": 2.570894241333008, + "learning_rate": 5.783132530120482e-05, + "loss": 2.2987, + "step": 532 + }, + { + "epoch": 2.140562248995984, + "grad_norm": 2.4431967735290527, + "learning_rate": 5.756358768406962e-05, + "loss": 2.1485, + "step": 533 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 2.789560079574585, + "learning_rate": 5.729585006693441e-05, + "loss": 2.3678, + "step": 534 + }, + { + "epoch": 2.1485943775100402, + "grad_norm": 2.691913366317749, + "learning_rate": 5.70281124497992e-05, + "loss": 2.3469, + "step": 535 + }, + { + "epoch": 2.1526104417670684, + "grad_norm": 2.472721815109253, + "learning_rate": 5.676037483266399e-05, + "loss": 2.0741, + "step": 536 + }, + { + "epoch": 2.1566265060240966, + "grad_norm": 2.705008029937744, + "learning_rate": 5.649263721552879e-05, + "loss": 2.3399, + "step": 537 + }, + { + "epoch": 2.1606425702811247, + "grad_norm": 2.8036177158355713, + "learning_rate": 5.6224899598393576e-05, + "loss": 2.4336, + "step": 538 + }, + { + "epoch": 2.1646586345381524, + "grad_norm": 2.8112568855285645, + "learning_rate": 5.595716198125837e-05, + "loss": 2.4039, + "step": 539 + }, + { + "epoch": 2.1686746987951806, + "grad_norm": 2.932802438735962, + "learning_rate": 5.568942436412317e-05, + "loss": 2.4175, + "step": 540 + }, + { + "epoch": 2.1726907630522088, + "grad_norm": 3.0952837467193604, + "learning_rate": 5.5421686746987955e-05, + "loss": 2.4552, + "step": 541 + }, + { + "epoch": 2.176706827309237, + "grad_norm": 2.6719419956207275, + "learning_rate": 5.515394912985275e-05, + "loss": 2.0765, + "step": 542 + }, + { + "epoch": 2.180722891566265, + "grad_norm": 3.0576534271240234, + "learning_rate": 5.488621151271755e-05, + "loss": 2.417, + "step": 543 + }, + { + "epoch": 2.1847389558232932, + "grad_norm": 3.0612807273864746, + "learning_rate": 5.461847389558233e-05, + "loss": 2.9868, + "step": 544 + }, + { + "epoch": 2.1887550200803214, + "grad_norm": 3.5036559104919434, + "learning_rate": 5.4350736278447126e-05, + "loss": 2.7975, + "step": 545 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 3.5645198822021484, + "learning_rate": 5.408299866131191e-05, + "loss": 2.8446, + "step": 546 + }, + { + "epoch": 2.1967871485943773, + "grad_norm": 2.72088360786438, + "learning_rate": 5.381526104417671e-05, + "loss": 2.3907, + "step": 547 + }, + { + "epoch": 2.2008032128514055, + "grad_norm": 3.901146411895752, + "learning_rate": 5.3547523427041504e-05, + "loss": 3.4091, + "step": 548 + }, + { + "epoch": 2.2048192771084336, + "grad_norm": 2.9762930870056152, + "learning_rate": 5.327978580990629e-05, + "loss": 2.2808, + "step": 549 + }, + { + "epoch": 2.208835341365462, + "grad_norm": 3.1252336502075195, + "learning_rate": 5.301204819277109e-05, + "loss": 2.3206, + "step": 550 + }, + { + "epoch": 2.21285140562249, + "grad_norm": 3.61395525932312, + "learning_rate": 5.274431057563588e-05, + "loss": 2.9899, + "step": 551 + }, + { + "epoch": 2.216867469879518, + "grad_norm": 3.035787582397461, + "learning_rate": 5.247657295850067e-05, + "loss": 2.2514, + "step": 552 + }, + { + "epoch": 2.2208835341365463, + "grad_norm": 3.0700008869171143, + "learning_rate": 5.220883534136547e-05, + "loss": 2.7965, + "step": 553 + }, + { + "epoch": 2.2248995983935744, + "grad_norm": 3.380383253097534, + "learning_rate": 5.194109772423026e-05, + "loss": 2.7258, + "step": 554 + }, + { + "epoch": 2.2289156626506026, + "grad_norm": 3.3445475101470947, + "learning_rate": 5.167336010709505e-05, + "loss": 3.0532, + "step": 555 + }, + { + "epoch": 2.2329317269076308, + "grad_norm": 3.305169105529785, + "learning_rate": 5.140562248995984e-05, + "loss": 2.7851, + "step": 556 + }, + { + "epoch": 2.2369477911646585, + "grad_norm": 3.3952481746673584, + "learning_rate": 5.113788487282464e-05, + "loss": 2.6845, + "step": 557 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 2.7673559188842773, + "learning_rate": 5.0870147255689426e-05, + "loss": 2.6067, + "step": 558 + }, + { + "epoch": 2.244979919678715, + "grad_norm": 3.3448803424835205, + "learning_rate": 5.060240963855422e-05, + "loss": 2.4804, + "step": 559 + }, + { + "epoch": 2.248995983935743, + "grad_norm": 2.797827959060669, + "learning_rate": 5.033467202141902e-05, + "loss": 2.1237, + "step": 560 + }, + { + "epoch": 2.253012048192771, + "grad_norm": 2.9383599758148193, + "learning_rate": 5.0066934404283804e-05, + "loss": 2.3107, + "step": 561 + }, + { + "epoch": 2.2570281124497993, + "grad_norm": 3.0028162002563477, + "learning_rate": 4.97991967871486e-05, + "loss": 3.2211, + "step": 562 + }, + { + "epoch": 2.2610441767068274, + "grad_norm": 2.928341865539551, + "learning_rate": 4.953145917001339e-05, + "loss": 2.5173, + "step": 563 + }, + { + "epoch": 2.2650602409638556, + "grad_norm": 2.9720232486724854, + "learning_rate": 4.926372155287818e-05, + "loss": 2.3146, + "step": 564 + }, + { + "epoch": 2.2690763052208833, + "grad_norm": 3.558094024658203, + "learning_rate": 4.8995983935742975e-05, + "loss": 3.1953, + "step": 565 + }, + { + "epoch": 2.2730923694779115, + "grad_norm": 3.0352494716644287, + "learning_rate": 4.872824631860777e-05, + "loss": 2.4965, + "step": 566 + }, + { + "epoch": 2.2771084337349397, + "grad_norm": 2.7428176403045654, + "learning_rate": 4.8460508701472554e-05, + "loss": 2.1514, + "step": 567 + }, + { + "epoch": 2.281124497991968, + "grad_norm": 2.3594534397125244, + "learning_rate": 4.8192771084337354e-05, + "loss": 1.8075, + "step": 568 + }, + { + "epoch": 2.285140562248996, + "grad_norm": 3.3449742794036865, + "learning_rate": 4.792503346720215e-05, + "loss": 2.5945, + "step": 569 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 3.104633331298828, + "learning_rate": 4.765729585006693e-05, + "loss": 2.9666, + "step": 570 + }, + { + "epoch": 2.2931726907630523, + "grad_norm": 3.094238758087158, + "learning_rate": 4.738955823293173e-05, + "loss": 2.489, + "step": 571 + }, + { + "epoch": 2.2971887550200805, + "grad_norm": 3.381775379180908, + "learning_rate": 4.712182061579652e-05, + "loss": 2.9042, + "step": 572 + }, + { + "epoch": 2.3012048192771086, + "grad_norm": 3.2117156982421875, + "learning_rate": 4.685408299866131e-05, + "loss": 2.6925, + "step": 573 + }, + { + "epoch": 2.305220883534137, + "grad_norm": 2.8267903327941895, + "learning_rate": 4.658634538152611e-05, + "loss": 2.3816, + "step": 574 + }, + { + "epoch": 2.3092369477911645, + "grad_norm": 3.068437099456787, + "learning_rate": 4.63186077643909e-05, + "loss": 2.3124, + "step": 575 + }, + { + "epoch": 2.3132530120481927, + "grad_norm": 2.832303762435913, + "learning_rate": 4.605087014725569e-05, + "loss": 2.5169, + "step": 576 + }, + { + "epoch": 2.317269076305221, + "grad_norm": 2.8893704414367676, + "learning_rate": 4.578313253012048e-05, + "loss": 2.3119, + "step": 577 + }, + { + "epoch": 2.321285140562249, + "grad_norm": 2.952976703643799, + "learning_rate": 4.5515394912985275e-05, + "loss": 2.3063, + "step": 578 + }, + { + "epoch": 2.325301204819277, + "grad_norm": 2.7303566932678223, + "learning_rate": 4.524765729585007e-05, + "loss": 2.5834, + "step": 579 + }, + { + "epoch": 2.3293172690763053, + "grad_norm": 2.9680216312408447, + "learning_rate": 4.497991967871486e-05, + "loss": 2.249, + "step": 580 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.997044324874878, + "learning_rate": 4.4712182061579654e-05, + "loss": 2.5954, + "step": 581 + }, + { + "epoch": 2.337349397590361, + "grad_norm": 3.4494729042053223, + "learning_rate": 4.4444444444444447e-05, + "loss": 3.1359, + "step": 582 + }, + { + "epoch": 2.3413654618473894, + "grad_norm": 3.1353585720062256, + "learning_rate": 4.417670682730924e-05, + "loss": 2.4317, + "step": 583 + }, + { + "epoch": 2.3453815261044175, + "grad_norm": 2.9816396236419678, + "learning_rate": 4.390896921017403e-05, + "loss": 2.8438, + "step": 584 + }, + { + "epoch": 2.3493975903614457, + "grad_norm": 2.6249794960021973, + "learning_rate": 4.3641231593038825e-05, + "loss": 2.0497, + "step": 585 + }, + { + "epoch": 2.353413654618474, + "grad_norm": 2.8994345664978027, + "learning_rate": 4.337349397590362e-05, + "loss": 2.149, + "step": 586 + }, + { + "epoch": 2.357429718875502, + "grad_norm": 3.8927950859069824, + "learning_rate": 4.3105756358768404e-05, + "loss": 3.0218, + "step": 587 + }, + { + "epoch": 2.36144578313253, + "grad_norm": 3.120274543762207, + "learning_rate": 4.2838018741633203e-05, + "loss": 2.1973, + "step": 588 + }, + { + "epoch": 2.3654618473895583, + "grad_norm": 3.104851007461548, + "learning_rate": 4.2570281124497996e-05, + "loss": 2.3442, + "step": 589 + }, + { + "epoch": 2.3694779116465865, + "grad_norm": 2.97161602973938, + "learning_rate": 4.230254350736278e-05, + "loss": 2.5706, + "step": 590 + }, + { + "epoch": 2.3734939759036147, + "grad_norm": 2.6856470108032227, + "learning_rate": 4.203480589022758e-05, + "loss": 2.0781, + "step": 591 + }, + { + "epoch": 2.3775100401606424, + "grad_norm": 2.9654481410980225, + "learning_rate": 4.176706827309237e-05, + "loss": 2.2495, + "step": 592 + }, + { + "epoch": 2.3815261044176705, + "grad_norm": 2.861020088195801, + "learning_rate": 4.149933065595716e-05, + "loss": 1.9942, + "step": 593 + }, + { + "epoch": 2.3855421686746987, + "grad_norm": 3.413158893585205, + "learning_rate": 4.123159303882196e-05, + "loss": 2.6585, + "step": 594 + }, + { + "epoch": 2.389558232931727, + "grad_norm": 3.1313233375549316, + "learning_rate": 4.0963855421686746e-05, + "loss": 2.9493, + "step": 595 + }, + { + "epoch": 2.393574297188755, + "grad_norm": 3.325638771057129, + "learning_rate": 4.069611780455154e-05, + "loss": 2.7101, + "step": 596 + }, + { + "epoch": 2.397590361445783, + "grad_norm": 2.991661787033081, + "learning_rate": 4.042838018741633e-05, + "loss": 2.5683, + "step": 597 + }, + { + "epoch": 2.4016064257028114, + "grad_norm": 3.0619921684265137, + "learning_rate": 4.0160642570281125e-05, + "loss": 2.5722, + "step": 598 + }, + { + "epoch": 2.4056224899598395, + "grad_norm": 2.730375289916992, + "learning_rate": 3.989290495314592e-05, + "loss": 2.2107, + "step": 599 + }, + { + "epoch": 2.4096385542168672, + "grad_norm": 2.5859103202819824, + "learning_rate": 3.962516733601071e-05, + "loss": 2.0576, + "step": 600 + }, + { + "epoch": 2.4136546184738954, + "grad_norm": 2.8956499099731445, + "learning_rate": 3.93574297188755e-05, + "loss": 2.1889, + "step": 601 + }, + { + "epoch": 2.4176706827309236, + "grad_norm": 2.575547933578491, + "learning_rate": 3.9089692101740296e-05, + "loss": 1.9322, + "step": 602 + }, + { + "epoch": 2.4216867469879517, + "grad_norm": 3.3304378986358643, + "learning_rate": 3.882195448460509e-05, + "loss": 2.4677, + "step": 603 + }, + { + "epoch": 2.42570281124498, + "grad_norm": 3.5554420948028564, + "learning_rate": 3.855421686746988e-05, + "loss": 2.6703, + "step": 604 + }, + { + "epoch": 2.429718875502008, + "grad_norm": 3.415844440460205, + "learning_rate": 3.8286479250334675e-05, + "loss": 2.9157, + "step": 605 + }, + { + "epoch": 2.433734939759036, + "grad_norm": 3.127218008041382, + "learning_rate": 3.801874163319947e-05, + "loss": 2.416, + "step": 606 + }, + { + "epoch": 2.4377510040160644, + "grad_norm": 3.796701192855835, + "learning_rate": 3.7751004016064253e-05, + "loss": 2.3505, + "step": 607 + }, + { + "epoch": 2.4417670682730925, + "grad_norm": 3.6044912338256836, + "learning_rate": 3.748326639892905e-05, + "loss": 2.8561, + "step": 608 + }, + { + "epoch": 2.4457831325301207, + "grad_norm": 3.2551517486572266, + "learning_rate": 3.7215528781793846e-05, + "loss": 2.5376, + "step": 609 + }, + { + "epoch": 2.4497991967871484, + "grad_norm": 2.890302896499634, + "learning_rate": 3.694779116465863e-05, + "loss": 2.2256, + "step": 610 + }, + { + "epoch": 2.4538152610441766, + "grad_norm": 3.478085517883301, + "learning_rate": 3.668005354752343e-05, + "loss": 2.6602, + "step": 611 + }, + { + "epoch": 2.4578313253012047, + "grad_norm": 3.682518720626831, + "learning_rate": 3.641231593038822e-05, + "loss": 2.8083, + "step": 612 + }, + { + "epoch": 2.461847389558233, + "grad_norm": 2.841364860534668, + "learning_rate": 3.614457831325301e-05, + "loss": 2.0827, + "step": 613 + }, + { + "epoch": 2.465863453815261, + "grad_norm": 2.784315347671509, + "learning_rate": 3.587684069611781e-05, + "loss": 3.9997, + "step": 614 + }, + { + "epoch": 2.4698795180722892, + "grad_norm": 3.153395652770996, + "learning_rate": 3.5609103078982596e-05, + "loss": 2.3443, + "step": 615 + }, + { + "epoch": 2.4738955823293174, + "grad_norm": 3.2817304134368896, + "learning_rate": 3.534136546184739e-05, + "loss": 2.6729, + "step": 616 + }, + { + "epoch": 2.4779116465863456, + "grad_norm": 2.8291358947753906, + "learning_rate": 3.507362784471218e-05, + "loss": 2.1918, + "step": 617 + }, + { + "epoch": 2.4819277108433733, + "grad_norm": 3.548492670059204, + "learning_rate": 3.4805890227576974e-05, + "loss": 3.5277, + "step": 618 + }, + { + "epoch": 2.4859437751004014, + "grad_norm": 9.622389793395996, + "learning_rate": 3.4538152610441774e-05, + "loss": 3.3926, + "step": 619 + }, + { + "epoch": 2.4899598393574296, + "grad_norm": 3.489105224609375, + "learning_rate": 3.427041499330656e-05, + "loss": 2.5828, + "step": 620 + }, + { + "epoch": 2.4939759036144578, + "grad_norm": 2.7694857120513916, + "learning_rate": 3.400267737617135e-05, + "loss": 1.9917, + "step": 621 + }, + { + "epoch": 2.497991967871486, + "grad_norm": 3.2993392944335938, + "learning_rate": 3.3734939759036146e-05, + "loss": 2.8177, + "step": 622 + }, + { + "epoch": 2.502008032128514, + "grad_norm": 2.863051176071167, + "learning_rate": 3.346720214190094e-05, + "loss": 2.0999, + "step": 623 + }, + { + "epoch": 2.5060240963855422, + "grad_norm": 3.025731086730957, + "learning_rate": 3.319946452476573e-05, + "loss": 2.555, + "step": 624 + }, + { + "epoch": 2.5100401606425704, + "grad_norm": 3.236588716506958, + "learning_rate": 3.2931726907630524e-05, + "loss": 2.3746, + "step": 625 + }, + { + "epoch": 2.5140562248995986, + "grad_norm": 3.071715831756592, + "learning_rate": 3.266398929049532e-05, + "loss": 2.1943, + "step": 626 + }, + { + "epoch": 2.5180722891566267, + "grad_norm": 3.353304147720337, + "learning_rate": 3.23962516733601e-05, + "loss": 3.2267, + "step": 627 + }, + { + "epoch": 2.522088353413655, + "grad_norm": 2.9166722297668457, + "learning_rate": 3.21285140562249e-05, + "loss": 2.5768, + "step": 628 + }, + { + "epoch": 2.5261044176706826, + "grad_norm": 2.571737051010132, + "learning_rate": 3.1860776439089695e-05, + "loss": 2.4097, + "step": 629 + }, + { + "epoch": 2.5301204819277108, + "grad_norm": 3.2051124572753906, + "learning_rate": 3.159303882195448e-05, + "loss": 2.6875, + "step": 630 + }, + { + "epoch": 2.534136546184739, + "grad_norm": 3.414586067199707, + "learning_rate": 3.132530120481928e-05, + "loss": 2.467, + "step": 631 + }, + { + "epoch": 2.538152610441767, + "grad_norm": 3.201895236968994, + "learning_rate": 3.105756358768407e-05, + "loss": 2.6332, + "step": 632 + }, + { + "epoch": 2.5421686746987953, + "grad_norm": 3.2875518798828125, + "learning_rate": 3.078982597054887e-05, + "loss": 3.0367, + "step": 633 + }, + { + "epoch": 2.5461847389558234, + "grad_norm": 2.6989524364471436, + "learning_rate": 3.052208835341366e-05, + "loss": 2.1665, + "step": 634 + }, + { + "epoch": 2.550200803212851, + "grad_norm": 2.7747488021850586, + "learning_rate": 3.025435073627845e-05, + "loss": 2.1499, + "step": 635 + }, + { + "epoch": 2.5542168674698793, + "grad_norm": 3.4082605838775635, + "learning_rate": 2.9986613119143242e-05, + "loss": 2.6462, + "step": 636 + }, + { + "epoch": 2.5582329317269075, + "grad_norm": 2.713757276535034, + "learning_rate": 2.971887550200803e-05, + "loss": 2.09, + "step": 637 + }, + { + "epoch": 2.5622489959839356, + "grad_norm": 3.2788338661193848, + "learning_rate": 2.9451137884872827e-05, + "loss": 2.3322, + "step": 638 + }, + { + "epoch": 2.566265060240964, + "grad_norm": 2.6642184257507324, + "learning_rate": 2.918340026773762e-05, + "loss": 2.1751, + "step": 639 + }, + { + "epoch": 2.570281124497992, + "grad_norm": 3.069793224334717, + "learning_rate": 2.891566265060241e-05, + "loss": 2.2499, + "step": 640 + }, + { + "epoch": 2.57429718875502, + "grad_norm": 3.132709503173828, + "learning_rate": 2.8647925033467206e-05, + "loss": 2.585, + "step": 641 + }, + { + "epoch": 2.5783132530120483, + "grad_norm": 3.27109432220459, + "learning_rate": 2.8380187416331995e-05, + "loss": 2.4458, + "step": 642 + }, + { + "epoch": 2.5823293172690764, + "grad_norm": 3.5450148582458496, + "learning_rate": 2.8112449799196788e-05, + "loss": 3.8692, + "step": 643 + }, + { + "epoch": 2.5863453815261046, + "grad_norm": 3.2768943309783936, + "learning_rate": 2.7844712182061584e-05, + "loss": 2.4152, + "step": 644 + }, + { + "epoch": 2.5903614457831328, + "grad_norm": 3.1916306018829346, + "learning_rate": 2.7576974564926374e-05, + "loss": 2.5376, + "step": 645 + }, + { + "epoch": 2.5943775100401605, + "grad_norm": 2.7519237995147705, + "learning_rate": 2.7309236947791167e-05, + "loss": 2.1762, + "step": 646 + }, + { + "epoch": 2.5983935742971886, + "grad_norm": 3.649415969848633, + "learning_rate": 2.7041499330655956e-05, + "loss": 3.0767, + "step": 647 + }, + { + "epoch": 2.602409638554217, + "grad_norm": 3.1575088500976562, + "learning_rate": 2.6773761713520752e-05, + "loss": 2.5746, + "step": 648 + }, + { + "epoch": 2.606425702811245, + "grad_norm": 3.1661970615386963, + "learning_rate": 2.6506024096385545e-05, + "loss": 2.8486, + "step": 649 + }, + { + "epoch": 2.610441767068273, + "grad_norm": 3.374446392059326, + "learning_rate": 2.6238286479250334e-05, + "loss": 3.0536, + "step": 650 + }, + { + "epoch": 2.6144578313253013, + "grad_norm": 3.2961578369140625, + "learning_rate": 2.597054886211513e-05, + "loss": 2.403, + "step": 651 + }, + { + "epoch": 2.6184738955823295, + "grad_norm": 3.078670024871826, + "learning_rate": 2.570281124497992e-05, + "loss": 2.0923, + "step": 652 + }, + { + "epoch": 2.622489959839357, + "grad_norm": 3.625155448913574, + "learning_rate": 2.5435073627844713e-05, + "loss": 3.3948, + "step": 653 + }, + { + "epoch": 2.6265060240963853, + "grad_norm": 3.2434301376342773, + "learning_rate": 2.516733601070951e-05, + "loss": 3.0131, + "step": 654 + }, + { + "epoch": 2.6305220883534135, + "grad_norm": 3.321974515914917, + "learning_rate": 2.48995983935743e-05, + "loss": 2.5972, + "step": 655 + }, + { + "epoch": 2.6345381526104417, + "grad_norm": 2.6846182346343994, + "learning_rate": 2.463186077643909e-05, + "loss": 2.2812, + "step": 656 + }, + { + "epoch": 2.63855421686747, + "grad_norm": 2.814183235168457, + "learning_rate": 2.4364123159303884e-05, + "loss": 2.1195, + "step": 657 + }, + { + "epoch": 2.642570281124498, + "grad_norm": 2.640397310256958, + "learning_rate": 2.4096385542168677e-05, + "loss": 2.1728, + "step": 658 + }, + { + "epoch": 2.646586345381526, + "grad_norm": 3.7056844234466553, + "learning_rate": 2.3828647925033466e-05, + "loss": 2.8224, + "step": 659 + }, + { + "epoch": 2.6506024096385543, + "grad_norm": 2.740823268890381, + "learning_rate": 2.356091030789826e-05, + "loss": 2.3886, + "step": 660 + }, + { + "epoch": 2.6546184738955825, + "grad_norm": 2.689279079437256, + "learning_rate": 2.3293172690763055e-05, + "loss": 2.3151, + "step": 661 + }, + { + "epoch": 2.6586345381526106, + "grad_norm": 3.4579248428344727, + "learning_rate": 2.3025435073627845e-05, + "loss": 2.7812, + "step": 662 + }, + { + "epoch": 2.662650602409639, + "grad_norm": 3.293381690979004, + "learning_rate": 2.2757697456492638e-05, + "loss": 2.9381, + "step": 663 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 3.3860654830932617, + "learning_rate": 2.248995983935743e-05, + "loss": 2.4111, + "step": 664 + }, + { + "epoch": 2.6706827309236947, + "grad_norm": 3.3504996299743652, + "learning_rate": 2.2222222222222223e-05, + "loss": 2.4411, + "step": 665 + }, + { + "epoch": 2.674698795180723, + "grad_norm": 3.2323498725891113, + "learning_rate": 2.1954484605087016e-05, + "loss": 2.6294, + "step": 666 + }, + { + "epoch": 2.678714859437751, + "grad_norm": 2.935426950454712, + "learning_rate": 2.168674698795181e-05, + "loss": 2.5489, + "step": 667 + }, + { + "epoch": 2.682730923694779, + "grad_norm": 3.483436346054077, + "learning_rate": 2.1419009370816602e-05, + "loss": 2.7512, + "step": 668 + }, + { + "epoch": 2.6867469879518073, + "grad_norm": 3.4001944065093994, + "learning_rate": 2.115127175368139e-05, + "loss": 2.4015, + "step": 669 + }, + { + "epoch": 2.6907630522088355, + "grad_norm": 3.6413683891296387, + "learning_rate": 2.0883534136546184e-05, + "loss": 3.5122, + "step": 670 + }, + { + "epoch": 2.694779116465863, + "grad_norm": 2.5411088466644287, + "learning_rate": 2.061579651941098e-05, + "loss": 2.0925, + "step": 671 + }, + { + "epoch": 2.6987951807228914, + "grad_norm": 3.1367125511169434, + "learning_rate": 2.034805890227577e-05, + "loss": 2.5457, + "step": 672 + }, + { + "epoch": 2.7028112449799195, + "grad_norm": 3.300114393234253, + "learning_rate": 2.0080321285140562e-05, + "loss": 3.0402, + "step": 673 + }, + { + "epoch": 2.7068273092369477, + "grad_norm": 2.744513750076294, + "learning_rate": 1.9812583668005355e-05, + "loss": 2.2273, + "step": 674 + }, + { + "epoch": 2.710843373493976, + "grad_norm": 3.0049889087677, + "learning_rate": 1.9544846050870148e-05, + "loss": 2.4656, + "step": 675 + }, + { + "epoch": 2.714859437751004, + "grad_norm": 2.9064860343933105, + "learning_rate": 1.927710843373494e-05, + "loss": 2.3855, + "step": 676 + }, + { + "epoch": 2.718875502008032, + "grad_norm": 3.317073106765747, + "learning_rate": 1.9009370816599734e-05, + "loss": 2.7036, + "step": 677 + }, + { + "epoch": 2.7228915662650603, + "grad_norm": 3.580209732055664, + "learning_rate": 1.8741633199464527e-05, + "loss": 2.4416, + "step": 678 + }, + { + "epoch": 2.7269076305220885, + "grad_norm": 3.0195388793945312, + "learning_rate": 1.8473895582329316e-05, + "loss": 2.0284, + "step": 679 + }, + { + "epoch": 2.7309236947791167, + "grad_norm": 3.5155584812164307, + "learning_rate": 1.820615796519411e-05, + "loss": 3.6898, + "step": 680 + }, + { + "epoch": 2.734939759036145, + "grad_norm": 3.3643851280212402, + "learning_rate": 1.7938420348058905e-05, + "loss": 2.7534, + "step": 681 + }, + { + "epoch": 2.7389558232931726, + "grad_norm": 3.949350595474243, + "learning_rate": 1.7670682730923694e-05, + "loss": 3.6933, + "step": 682 + }, + { + "epoch": 2.7429718875502007, + "grad_norm": 2.7811617851257324, + "learning_rate": 1.7402945113788487e-05, + "loss": 2.0857, + "step": 683 + }, + { + "epoch": 2.746987951807229, + "grad_norm": 3.3071796894073486, + "learning_rate": 1.713520749665328e-05, + "loss": 2.9454, + "step": 684 + }, + { + "epoch": 2.751004016064257, + "grad_norm": 3.181541919708252, + "learning_rate": 1.6867469879518073e-05, + "loss": 2.4977, + "step": 685 + }, + { + "epoch": 2.755020080321285, + "grad_norm": 2.8570432662963867, + "learning_rate": 1.6599732262382866e-05, + "loss": 2.2448, + "step": 686 + }, + { + "epoch": 2.7590361445783134, + "grad_norm": 2.8519392013549805, + "learning_rate": 1.633199464524766e-05, + "loss": 2.0659, + "step": 687 + }, + { + "epoch": 2.7630522088353415, + "grad_norm": 3.0057828426361084, + "learning_rate": 1.606425702811245e-05, + "loss": 2.711, + "step": 688 + }, + { + "epoch": 2.7670682730923692, + "grad_norm": 3.7644693851470947, + "learning_rate": 1.579651941097724e-05, + "loss": 2.7368, + "step": 689 + }, + { + "epoch": 2.7710843373493974, + "grad_norm": 3.339076519012451, + "learning_rate": 1.5528781793842034e-05, + "loss": 2.4372, + "step": 690 + }, + { + "epoch": 2.7751004016064256, + "grad_norm": 3.3303468227386475, + "learning_rate": 1.526104417670683e-05, + "loss": 2.1496, + "step": 691 + }, + { + "epoch": 2.7791164658634537, + "grad_norm": 3.007516384124756, + "learning_rate": 1.4993306559571621e-05, + "loss": 2.0637, + "step": 692 + }, + { + "epoch": 2.783132530120482, + "grad_norm": 3.2054901123046875, + "learning_rate": 1.4725568942436414e-05, + "loss": 2.6325, + "step": 693 + }, + { + "epoch": 2.78714859437751, + "grad_norm": 3.089660882949829, + "learning_rate": 1.4457831325301205e-05, + "loss": 2.6186, + "step": 694 + }, + { + "epoch": 2.791164658634538, + "grad_norm": 3.6075477600097656, + "learning_rate": 1.4190093708165998e-05, + "loss": 3.04, + "step": 695 + }, + { + "epoch": 2.7951807228915664, + "grad_norm": 2.9559810161590576, + "learning_rate": 1.3922356091030792e-05, + "loss": 2.1752, + "step": 696 + }, + { + "epoch": 2.7991967871485945, + "grad_norm": 3.062072992324829, + "learning_rate": 1.3654618473895583e-05, + "loss": 2.0509, + "step": 697 + }, + { + "epoch": 2.8032128514056227, + "grad_norm": 4.112563610076904, + "learning_rate": 1.3386880856760376e-05, + "loss": 2.937, + "step": 698 + }, + { + "epoch": 2.807228915662651, + "grad_norm": 3.2194480895996094, + "learning_rate": 1.3119143239625167e-05, + "loss": 2.2974, + "step": 699 + }, + { + "epoch": 2.8112449799196786, + "grad_norm": 3.2111270427703857, + "learning_rate": 1.285140562248996e-05, + "loss": 2.3903, + "step": 700 + }, + { + "epoch": 2.8152610441767068, + "grad_norm": 3.1619982719421387, + "learning_rate": 1.2583668005354755e-05, + "loss": 2.154, + "step": 701 + }, + { + "epoch": 2.819277108433735, + "grad_norm": 3.0533196926116943, + "learning_rate": 1.2315930388219546e-05, + "loss": 2.8862, + "step": 702 + }, + { + "epoch": 2.823293172690763, + "grad_norm": 2.838397264480591, + "learning_rate": 1.2048192771084338e-05, + "loss": 2.1974, + "step": 703 + }, + { + "epoch": 2.8273092369477912, + "grad_norm": 2.960359573364258, + "learning_rate": 1.178045515394913e-05, + "loss": 2.2714, + "step": 704 + }, + { + "epoch": 2.8313253012048194, + "grad_norm": 3.3387844562530518, + "learning_rate": 1.1512717536813922e-05, + "loss": 2.5617, + "step": 705 + }, + { + "epoch": 2.835341365461847, + "grad_norm": 3.802029609680176, + "learning_rate": 1.1244979919678715e-05, + "loss": 2.6791, + "step": 706 + }, + { + "epoch": 2.8393574297188753, + "grad_norm": 3.0797119140625, + "learning_rate": 1.0977242302543508e-05, + "loss": 2.008, + "step": 707 + }, + { + "epoch": 2.8433734939759034, + "grad_norm": 3.6929612159729004, + "learning_rate": 1.0709504685408301e-05, + "loss": 3.0253, + "step": 708 + }, + { + "epoch": 2.8473895582329316, + "grad_norm": 3.409666061401367, + "learning_rate": 1.0441767068273092e-05, + "loss": 2.488, + "step": 709 + }, + { + "epoch": 2.8514056224899598, + "grad_norm": 3.4419896602630615, + "learning_rate": 1.0174029451137885e-05, + "loss": 2.5107, + "step": 710 + }, + { + "epoch": 2.855421686746988, + "grad_norm": 2.9970462322235107, + "learning_rate": 9.906291834002678e-06, + "loss": 2.4561, + "step": 711 + }, + { + "epoch": 2.859437751004016, + "grad_norm": 2.9567370414733887, + "learning_rate": 9.63855421686747e-06, + "loss": 2.0972, + "step": 712 + }, + { + "epoch": 2.8634538152610443, + "grad_norm": 3.134462356567383, + "learning_rate": 9.370816599732263e-06, + "loss": 2.4256, + "step": 713 + }, + { + "epoch": 2.8674698795180724, + "grad_norm": 3.376096487045288, + "learning_rate": 9.103078982597054e-06, + "loss": 2.221, + "step": 714 + }, + { + "epoch": 2.8714859437751006, + "grad_norm": 3.569254159927368, + "learning_rate": 8.835341365461847e-06, + "loss": 2.379, + "step": 715 + }, + { + "epoch": 2.8755020080321287, + "grad_norm": 3.4028611183166504, + "learning_rate": 8.56760374832664e-06, + "loss": 2.3297, + "step": 716 + }, + { + "epoch": 2.8795180722891565, + "grad_norm": 3.772540807723999, + "learning_rate": 8.299866131191433e-06, + "loss": 2.9839, + "step": 717 + }, + { + "epoch": 2.8835341365461846, + "grad_norm": 3.2679340839385986, + "learning_rate": 8.032128514056226e-06, + "loss": 2.3875, + "step": 718 + }, + { + "epoch": 2.887550200803213, + "grad_norm": 3.6074769496917725, + "learning_rate": 7.764390896921017e-06, + "loss": 2.9021, + "step": 719 + }, + { + "epoch": 2.891566265060241, + "grad_norm": 3.7479116916656494, + "learning_rate": 7.4966532797858104e-06, + "loss": 2.5803, + "step": 720 + }, + { + "epoch": 2.895582329317269, + "grad_norm": 3.051452875137329, + "learning_rate": 7.228915662650602e-06, + "loss": 2.9504, + "step": 721 + }, + { + "epoch": 2.8995983935742973, + "grad_norm": 3.341724157333374, + "learning_rate": 6.961178045515396e-06, + "loss": 2.8643, + "step": 722 + }, + { + "epoch": 2.9036144578313254, + "grad_norm": 2.8065922260284424, + "learning_rate": 6.693440428380188e-06, + "loss": 2.6456, + "step": 723 + }, + { + "epoch": 2.907630522088353, + "grad_norm": 3.295828342437744, + "learning_rate": 6.42570281124498e-06, + "loss": 3.2691, + "step": 724 + }, + { + "epoch": 2.9116465863453813, + "grad_norm": 3.15494966506958, + "learning_rate": 6.157965194109773e-06, + "loss": 2.3256, + "step": 725 + }, + { + "epoch": 2.9156626506024095, + "grad_norm": 3.146188259124756, + "learning_rate": 5.890227576974565e-06, + "loss": 2.5247, + "step": 726 + }, + { + "epoch": 2.9196787148594376, + "grad_norm": 3.042181968688965, + "learning_rate": 5.622489959839358e-06, + "loss": 2.3458, + "step": 727 + }, + { + "epoch": 2.923694779116466, + "grad_norm": 2.8072509765625, + "learning_rate": 5.3547523427041504e-06, + "loss": 2.2129, + "step": 728 + }, + { + "epoch": 2.927710843373494, + "grad_norm": 3.1902520656585693, + "learning_rate": 5.087014725568942e-06, + "loss": 2.1905, + "step": 729 + }, + { + "epoch": 2.931726907630522, + "grad_norm": 3.706218719482422, + "learning_rate": 4.819277108433735e-06, + "loss": 2.8587, + "step": 730 + }, + { + "epoch": 2.9357429718875503, + "grad_norm": 3.516908645629883, + "learning_rate": 4.551539491298527e-06, + "loss": 3.0003, + "step": 731 + }, + { + "epoch": 2.9397590361445785, + "grad_norm": 3.9051806926727295, + "learning_rate": 4.28380187416332e-06, + "loss": 2.6986, + "step": 732 + }, + { + "epoch": 2.9437751004016066, + "grad_norm": 2.434493064880371, + "learning_rate": 4.016064257028113e-06, + "loss": 2.0143, + "step": 733 + }, + { + "epoch": 2.9477911646586348, + "grad_norm": 3.514988899230957, + "learning_rate": 3.7483266398929052e-06, + "loss": 2.5539, + "step": 734 + }, + { + "epoch": 2.9518072289156625, + "grad_norm": 3.145475387573242, + "learning_rate": 3.480589022757698e-06, + "loss": 2.3991, + "step": 735 + }, + { + "epoch": 2.9558232931726907, + "grad_norm": 3.0328280925750732, + "learning_rate": 3.21285140562249e-06, + "loss": 2.4384, + "step": 736 + }, + { + "epoch": 2.959839357429719, + "grad_norm": 3.584406614303589, + "learning_rate": 2.9451137884872824e-06, + "loss": 2.219, + "step": 737 + }, + { + "epoch": 2.963855421686747, + "grad_norm": 2.8902695178985596, + "learning_rate": 2.6773761713520752e-06, + "loss": 2.0701, + "step": 738 + }, + { + "epoch": 2.967871485943775, + "grad_norm": 2.714848518371582, + "learning_rate": 2.4096385542168676e-06, + "loss": 2.3578, + "step": 739 + }, + { + "epoch": 2.9718875502008033, + "grad_norm": 3.4589223861694336, + "learning_rate": 2.14190093708166e-06, + "loss": 2.4076, + "step": 740 + }, + { + "epoch": 2.9759036144578315, + "grad_norm": 2.8250577449798584, + "learning_rate": 1.8741633199464526e-06, + "loss": 2.2688, + "step": 741 + }, + { + "epoch": 2.979919678714859, + "grad_norm": 3.090301752090454, + "learning_rate": 1.606425702811245e-06, + "loss": 2.0527, + "step": 742 + }, + { + "epoch": 2.9839357429718874, + "grad_norm": 3.82488751411438, + "learning_rate": 1.3386880856760376e-06, + "loss": 2.9784, + "step": 743 + }, + { + "epoch": 2.9879518072289155, + "grad_norm": 3.046949863433838, + "learning_rate": 1.07095046854083e-06, + "loss": 2.987, + "step": 744 + }, + { + "epoch": 2.9919678714859437, + "grad_norm": 3.08667254447937, + "learning_rate": 8.032128514056225e-07, + "loss": 2.3121, + "step": 745 + }, + { + "epoch": 2.995983935742972, + "grad_norm": 3.114004611968994, + "learning_rate": 5.35475234270415e-07, + "loss": 2.4549, + "step": 746 + }, + { + "epoch": 3.0, + "grad_norm": 3.1294381618499756, + "learning_rate": 2.677376171352075e-07, + "loss": 2.2527, + "step": 747 + }, + { + "epoch": 3.0, + "eval_loss": 0.8732815980911255, + "eval_runtime": 201.6297, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 1.24, + "step": 747 + } + ], + "logging_steps": 1, + "max_steps": 747, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0605631120002253e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}