ggbetz committed · verified
Commit 032b6ce · 1 Parent(s): a0d4f28

Model save
README.md CHANGED
@@ -3,8 +3,8 @@ library_name: transformers
  model_name: Phi-4-Argunaut-1-SPIN-dev1
  tags:
  - generated_from_trainer
- - trl
  - dpo
+ - trl
  licence: license
  ---

@@ -26,7 +26,7 @@ print(output["generated_text"])

  ## Training procedure

- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/rw9hb1r5)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/xxqqp8c6)


  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
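The card only says that the model was trained with DPO via trl. For readers unfamiliar with that setup, the following is a minimal, hypothetical sketch of such a run. The base-model id, the toy preference pairs, and the learning rate are illustrative assumptions; only the epoch, logging, and save settings mirror the trainer_state.json diff further down, and exact keyword names vary slightly across trl releases.

```python
# Minimal sketch of a DPO run with TRL's DPOTrainer (illustrative, not the author's exact script).
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

base_model = "microsoft/phi-4"  # assumed base model; see the model card for the actual one
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# DPO expects preference pairs: a prompt plus a chosen and a rejected completion (toy data here).
train_dataset = Dataset.from_dict({
    "prompt": ["Reconstruct the argument in standard form."],
    "chosen": ["(1) ... (2) ... Therefore: ..."],
    "rejected": ["Arguments cannot be reconstructed."],
})

args = DPOConfig(
    output_dir="Phi-4-Argunaut-1-SPIN-dev1",
    num_train_epochs=2,    # "num_train_epochs": 2 in trainer_state.json
    logging_steps=5,       # "logging_steps": 5
    save_steps=50,         # "save_steps": 50
    learning_rate=5e-7,    # assumed peak LR, consistent with the logged schedule
    report_to="wandb",     # the card links a Weights & Biases run
)

trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    processing_class=tokenizer,  # older trl versions take `tokenizer=` instead
)
trainer.train()
```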
all_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 2.0,
  "total_flos": 0.0,
- "train_loss": 0.4061378458120527,
- "train_runtime": 4146.7376,
- "train_samples": 4358,
- "train_samples_per_second": 2.102,
- "train_steps_per_second": 0.066
+ "train_loss": 0.2737119815832267,
+ "train_runtime": 3862.0227,
+ "train_samples": 5211,
+ "train_samples_per_second": 2.699,
+ "train_steps_per_second": 0.084
  }
model-00001-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0790253cde713484954c768d771b249fa96bdec72fcb90a7fe61308afde03ea8
+ oid sha256:26e24236fbb6eb1b1dd08f45b733e7d11a8f5c30ec2e27ba0fe29630daa7329f
  size 4933658528
model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:aa0e110663bff5a9b4733400e4adf94cf1b21a373b4a8963e5ecfb8e93df2337
+ oid sha256:e4c2e57ad5cf1f010ba9d6c84797f82e490bdd911e169e71727635a9cc7bda35
  size 4954693112
model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4d64d5643daed3ba9e7ad668ff72d194da23b86cf933075267ead23542ea23e8
+ oid sha256:e3be77aeecea5c0f1fb1f44d9d6f09f2018780173dbcafce90010ab7b77c2b53
  size 4902243992
model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a9fb96293773e40d31b31b72ac3616598c09c6d178ab2f072a1311c0035fc0ad
+ oid sha256:ebbf621f807796710922fbd0cb750e413d93ee8185e892352dfb0c1982016a49
  size 4954672440
model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46c2df5a5a7713774603a6df2017de6b1488ae1cad2136ceb12a71944ffe551f
+ oid sha256:18945e8741d0e6a9a5b24addd883d59e87383c2f1f09bccae6ee72f7f3e6fcee
  size 4954672432
model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dba5dbed24be276d6511cdc0832ec499503a9d77a073fcf70ed183adf92bdabf
+ oid sha256:e0c5346da8c3f1abbfdfe35170c7457618b4f2208b3f5cb42df08f012551c15b
  size 4619116224
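Each shard diff above changes only the Git LFS pointer: the `oid sha256:` value is the SHA-256 digest of the shard's full contents, and `size` is its byte count (unchanged here, since the tensor shapes are identical). A downloaded shard can be checked against its pointer with a short script like the following sketch; the file name and expected hash are taken from shard 1 of the new revision.

```python
# Verify a downloaded safetensors shard against its Git LFS pointer (oid = SHA-256 of the file).
import hashlib

def file_sha256(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "26e24236fbb6eb1b1dd08f45b733e7d11a8f5c30ec2e27ba0fe29630daa7329f"  # new oid, shard 1
assert file_sha256("model-00001-of-00006.safetensors") == expected
```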
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 2.0,
  "total_flos": 0.0,
- "train_loss": 0.4061378458120527,
- "train_runtime": 4146.7376,
- "train_samples": 4358,
- "train_samples_per_second": 2.102,
- "train_steps_per_second": 0.066
+ "train_loss": 0.2737119815832267,
+ "train_runtime": 3862.0227,
+ "train_samples": 5211,
+ "train_samples_per_second": 2.699,
+ "train_steps_per_second": 0.084
  }
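The summary metrics above are internally consistent, which is a quick way to sanity-check a logged run: samples per second is samples × epochs ÷ runtime, and steps per second is global steps ÷ runtime (326 steps, from the new trainer_state.json below). The arithmetic as a small check; note that the ~32 effective batch size it implies is an inference, not a value reported in these files.

```python
# Cross-check the new train_results.json numbers.
train_samples = 5211
epochs = 2.0
train_runtime = 3862.0227   # seconds
global_steps = 326          # "global_step" in the new trainer_state.json

print(round(train_samples * epochs / train_runtime, 3))   # 2.699 == train_samples_per_second
print(round(global_steps / train_runtime, 3))             # 0.084 == train_steps_per_second
print(round(train_samples * epochs / global_steps))       # ~32 -> implied effective batch size
```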
trainer_state.json CHANGED
@@ -4,833 +4,998 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 274,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03669724770642202,
14
- "grad_norm": 33.22876437768799,
15
- "learning_rate": 1.4285714285714285e-07,
16
- "logits/chosen": -1.9453125,
17
- "logits/rejected": -1.8650391101837158,
18
- "logps/chosen": -187.99374389648438,
19
- "logps/rejected": -300.4921875,
20
- "loss": 0.5688,
21
- "rewards/accuracies": 0.768750011920929,
22
- "rewards/chosen": 0.761932373046875,
23
- "rewards/margins": 1.7284057140350342,
24
- "rewards/rejected": -0.96636962890625,
25
  "step": 5
26
  },
27
  {
28
- "epoch": 0.07339449541284404,
29
- "grad_norm": 19.253905224243972,
30
- "learning_rate": 3.2142857142857145e-07,
31
- "logits/chosen": -1.878515601158142,
32
- "logits/rejected": -1.787695288658142,
33
- "logps/chosen": -177.8874969482422,
34
- "logps/rejected": -218.62344360351562,
35
- "loss": 0.5688,
36
- "rewards/accuracies": 0.7562500238418579,
37
- "rewards/chosen": 0.761035144329071,
38
- "rewards/margins": 1.498071312904358,
39
- "rewards/rejected": -0.7366088628768921,
40
  "step": 10
41
  },
42
  {
43
- "epoch": 0.11009174311926606,
44
- "grad_norm": 20.635311940565074,
45
- "learning_rate": 5e-07,
46
- "logits/chosen": -1.910546898841858,
47
- "logits/rejected": -1.766992211341858,
48
- "logps/chosen": -245.59375,
49
- "logps/rejected": -357.078125,
50
- "loss": 0.4196,
51
- "rewards/accuracies": 0.800000011920929,
52
- "rewards/chosen": 0.6919189691543579,
53
- "rewards/margins": 1.993749976158142,
54
- "rewards/rejected": -1.2986023426055908,
55
  "step": 15
56
  },
57
  {
58
- "epoch": 0.14678899082568808,
59
- "grad_norm": 22.286628210174552,
60
- "learning_rate": 4.903846153846153e-07,
61
- "logits/chosen": -1.884765625,
62
- "logits/rejected": -1.784570336341858,
63
- "logps/chosen": -230.64218139648438,
64
- "logps/rejected": -226.67343139648438,
65
- "loss": 0.4481,
66
- "rewards/accuracies": 0.8125,
67
- "rewards/chosen": 0.9842529296875,
68
- "rewards/margins": 1.7820312976837158,
69
- "rewards/rejected": -0.797027587890625,
70
  "step": 20
71
  },
72
  {
73
- "epoch": 0.1834862385321101,
74
- "grad_norm": 15.503780898074503,
75
- "learning_rate": 4.807692307692307e-07,
76
- "logits/chosen": -1.941796898841858,
77
- "logits/rejected": -1.8427734375,
78
- "logps/chosen": -174.4609375,
79
- "logps/rejected": -234.84530639648438,
80
- "loss": 0.4752,
81
- "rewards/accuracies": 0.75,
82
- "rewards/chosen": 0.92529296875,
83
- "rewards/margins": 1.6923644542694092,
84
- "rewards/rejected": -0.76837158203125,
85
  "step": 25
86
  },
87
  {
88
- "epoch": 0.22018348623853212,
89
- "grad_norm": 24.666179033188698,
90
- "learning_rate": 4.711538461538461e-07,
91
- "logits/chosen": -1.963281273841858,
92
- "logits/rejected": -1.7822265625,
93
- "logps/chosen": -209.16250610351562,
94
- "logps/rejected": -469.15625,
95
- "loss": 0.4731,
96
- "rewards/accuracies": 0.7749999761581421,
97
- "rewards/chosen": 0.8801330327987671,
98
- "rewards/margins": 1.8875243663787842,
99
- "rewards/rejected": -1.007788062095642,
100
  "step": 30
101
  },
102
  {
103
- "epoch": 0.25688073394495414,
104
- "grad_norm": 39.31686614101177,
105
- "learning_rate": 4.6153846153846156e-07,
106
- "logits/chosen": -1.9373047351837158,
107
- "logits/rejected": -1.843164086341858,
108
- "logps/chosen": -229.640625,
109
- "logps/rejected": -256.80157470703125,
110
- "loss": 0.4954,
111
- "rewards/accuracies": 0.7749999761581421,
112
- "rewards/chosen": 0.9551330804824829,
113
- "rewards/margins": 1.919677734375,
114
- "rewards/rejected": -0.9655059576034546,
115
  "step": 35
116
  },
117
  {
118
- "epoch": 0.29357798165137616,
119
- "grad_norm": 25.350998609878594,
120
- "learning_rate": 4.519230769230769e-07,
121
- "logits/chosen": -1.98046875,
122
- "logits/rejected": -1.8894531726837158,
123
- "logps/chosen": -206.1374969482422,
124
- "logps/rejected": -301.48126220703125,
125
- "loss": 0.4124,
126
- "rewards/accuracies": 0.7875000238418579,
127
- "rewards/chosen": 1.19140625,
128
- "rewards/margins": 2.3330321311950684,
129
- "rewards/rejected": -1.14300537109375,
130
  "step": 40
131
  },
132
  {
133
- "epoch": 0.3302752293577982,
134
- "grad_norm": 11.623966835415967,
135
- "learning_rate": 4.423076923076923e-07,
136
- "logits/chosen": -1.8582031726837158,
137
- "logits/rejected": -1.7023437023162842,
138
- "logps/chosen": -171.0906219482422,
139
- "logps/rejected": -261.80938720703125,
140
- "loss": 0.4269,
141
- "rewards/accuracies": 0.800000011920929,
142
- "rewards/chosen": 1.000451683998108,
143
- "rewards/margins": 1.933935523033142,
144
- "rewards/rejected": -0.9315429925918579,
145
  "step": 45
146
  },
147
  {
148
- "epoch": 0.3669724770642202,
149
- "grad_norm": 7.730087812810576,
150
- "learning_rate": 4.326923076923077e-07,
151
- "logits/chosen": -1.804296851158142,
152
- "logits/rejected": -1.739648461341858,
153
- "logps/chosen": -143.0078125,
154
- "logps/rejected": -210.30313110351562,
155
- "loss": 0.3994,
156
- "rewards/accuracies": 0.831250011920929,
157
- "rewards/chosen": 1.30889892578125,
158
- "rewards/margins": 1.947778344154358,
159
- "rewards/rejected": -0.6377807855606079,
160
  "step": 50
161
  },
162
  {
163
- "epoch": 0.4036697247706422,
164
- "grad_norm": 25.104254895206235,
165
- "learning_rate": 4.2307692307692304e-07,
166
- "logits/chosen": -1.8533203601837158,
167
- "logits/rejected": -1.7498047351837158,
168
- "logps/chosen": -167.6453094482422,
169
- "logps/rejected": -299.8343811035156,
170
- "loss": 0.5257,
171
- "rewards/accuracies": 0.762499988079071,
172
- "rewards/chosen": 1.0325927734375,
173
- "rewards/margins": 1.3797729015350342,
174
- "rewards/rejected": -0.34746092557907104,
175
  "step": 55
176
  },
177
  {
178
- "epoch": 0.44036697247706424,
179
- "grad_norm": 46.19801478111461,
180
- "learning_rate": 4.134615384615384e-07,
181
- "logits/chosen": -1.813085913658142,
182
- "logits/rejected": -1.788671851158142,
183
- "logps/chosen": -266.36407470703125,
184
- "logps/rejected": -231.34530639648438,
185
- "loss": 0.553,
186
- "rewards/accuracies": 0.78125,
187
- "rewards/chosen": 1.023596167564392,
188
- "rewards/margins": 1.6953766345977783,
189
- "rewards/rejected": -0.6730102300643921,
190
  "step": 60
191
  },
192
  {
193
- "epoch": 0.47706422018348627,
194
- "grad_norm": 28.971320495320786,
195
- "learning_rate": 4.0384615384615386e-07,
196
- "logits/chosen": -1.806640625,
197
- "logits/rejected": -1.747656226158142,
198
- "logps/chosen": -214.56875610351562,
199
- "logps/rejected": -283.62969970703125,
200
- "loss": 0.4405,
201
- "rewards/accuracies": 0.824999988079071,
202
- "rewards/chosen": 0.7670348882675171,
203
- "rewards/margins": 1.659521460533142,
204
- "rewards/rejected": -0.891857922077179,
205
  "step": 65
206
  },
207
  {
208
- "epoch": 0.5137614678899083,
209
- "grad_norm": 16.80170550474919,
210
- "learning_rate": 3.942307692307692e-07,
211
- "logits/chosen": -1.874609351158142,
212
- "logits/rejected": -1.8097655773162842,
213
- "logps/chosen": -192.9718780517578,
214
- "logps/rejected": -241.9343719482422,
215
- "loss": 0.4856,
216
- "rewards/accuracies": 0.75,
217
- "rewards/chosen": 1.166839599609375,
218
- "rewards/margins": 1.6161620616912842,
219
- "rewards/rejected": -0.45008546113967896,
220
  "step": 70
221
  },
222
  {
223
- "epoch": 0.5504587155963303,
224
- "grad_norm": 26.406721071076017,
225
- "learning_rate": 3.8461538461538463e-07,
226
- "logits/chosen": -1.87109375,
227
- "logits/rejected": -1.772851586341858,
228
- "logps/chosen": -171.234375,
229
- "logps/rejected": -202.3640594482422,
230
- "loss": 0.4186,
231
- "rewards/accuracies": 0.800000011920929,
232
- "rewards/chosen": 1.05938720703125,
233
- "rewards/margins": 1.654272437095642,
234
- "rewards/rejected": -0.5958007574081421,
235
  "step": 75
236
  },
237
  {
238
- "epoch": 0.5871559633027523,
239
- "grad_norm": 12.695931204244943,
240
- "learning_rate": 3.75e-07,
241
- "logits/chosen": -1.9052734375,
242
- "logits/rejected": -1.784570336341858,
243
- "logps/chosen": -179.98281860351562,
244
- "logps/rejected": -362.39373779296875,
245
- "loss": 0.4228,
246
- "rewards/accuracies": 0.8125,
247
- "rewards/chosen": 0.9754394292831421,
248
- "rewards/margins": 1.9592773914337158,
249
- "rewards/rejected": -0.9834350347518921,
250
  "step": 80
251
  },
252
  {
253
- "epoch": 0.6238532110091743,
254
- "grad_norm": 11.877707612125597,
255
- "learning_rate": 3.6538461538461534e-07,
256
- "logits/chosen": -1.870703101158142,
257
- "logits/rejected": -1.8044922351837158,
258
- "logps/chosen": -184.58438110351562,
259
- "logps/rejected": -252.95938110351562,
260
- "loss": 0.4589,
261
- "rewards/accuracies": 0.768750011920929,
262
- "rewards/chosen": 0.9694183468818665,
263
- "rewards/margins": 1.938848853111267,
264
- "rewards/rejected": -0.9698241949081421,
265
  "step": 85
266
  },
267
  {
268
- "epoch": 0.6605504587155964,
269
- "grad_norm": 24.425154245858224,
270
- "learning_rate": 3.557692307692308e-07,
271
- "logits/chosen": -1.81640625,
272
- "logits/rejected": -1.730859398841858,
273
- "logps/chosen": -131.6875,
274
- "logps/rejected": -210.6374969482422,
275
- "loss": 0.3979,
276
- "rewards/accuracies": 0.8187500238418579,
277
- "rewards/chosen": 1.023162841796875,
278
- "rewards/margins": 1.620874047279358,
279
- "rewards/rejected": -0.597125232219696,
280
  "step": 90
281
  },
282
  {
283
- "epoch": 0.6972477064220184,
284
- "grad_norm": 11.31997736864757,
285
- "learning_rate": 3.461538461538461e-07,
286
- "logits/chosen": -1.8603515625,
287
- "logits/rejected": -1.7619140148162842,
288
- "logps/chosen": -187.9718780517578,
289
- "logps/rejected": -293.79998779296875,
290
- "loss": 0.4306,
291
- "rewards/accuracies": 0.793749988079071,
292
- "rewards/chosen": 0.9427169561386108,
293
- "rewards/margins": 2.0055174827575684,
294
- "rewards/rejected": -1.0625121593475342,
295
  "step": 95
296
  },
297
  {
298
- "epoch": 0.7339449541284404,
299
- "grad_norm": 16.984533533471815,
300
- "learning_rate": 3.3653846153846154e-07,
301
- "logits/chosen": -1.8857421875,
302
- "logits/rejected": -1.783789038658142,
303
- "logps/chosen": -161.58749389648438,
304
- "logps/rejected": -282.7953186035156,
305
- "loss": 0.4301,
306
- "rewards/accuracies": 0.78125,
307
- "rewards/chosen": 0.9314972162246704,
308
- "rewards/margins": 1.686669945716858,
309
- "rewards/rejected": -0.7535156011581421,
310
  "step": 100
311
  },
312
  {
313
- "epoch": 0.7706422018348624,
314
- "grad_norm": 22.070587300610953,
315
- "learning_rate": 3.269230769230769e-07,
316
- "logits/chosen": -1.9326171875,
317
- "logits/rejected": -1.821679711341858,
318
- "logps/chosen": -218.49374389648438,
319
- "logps/rejected": -321.09375,
320
- "loss": 0.3785,
321
  "rewards/accuracies": 0.856249988079071,
322
- "rewards/chosen": 1.01708984375,
323
- "rewards/margins": 2.0724120140075684,
324
- "rewards/rejected": -1.054956078529358,
325
  "step": 105
326
  },
327
  {
328
- "epoch": 0.8073394495412844,
329
- "grad_norm": 15.404403703735678,
330
- "learning_rate": 3.1730769230769225e-07,
331
- "logits/chosen": -1.875,
332
- "logits/rejected": -1.7683594226837158,
333
- "logps/chosen": -223.3468780517578,
334
- "logps/rejected": -269.09063720703125,
335
- "loss": 0.3714,
336
- "rewards/accuracies": 0.831250011920929,
337
- "rewards/chosen": 0.7868896722793579,
338
- "rewards/margins": 1.91033935546875,
339
- "rewards/rejected": -1.123510718345642,
340
  "step": 110
341
  },
342
  {
343
- "epoch": 0.8440366972477065,
344
- "grad_norm": 33.94178443490644,
345
- "learning_rate": 3.076923076923077e-07,
346
- "logits/chosen": -1.8820312023162842,
347
- "logits/rejected": -1.7746093273162842,
348
- "logps/chosen": -200.58438110351562,
349
- "logps/rejected": -195.2468719482422,
350
- "loss": 0.4604,
351
- "rewards/accuracies": 0.8187500238418579,
352
- "rewards/chosen": 0.874560534954071,
353
- "rewards/margins": 1.737646460533142,
354
- "rewards/rejected": -0.8634979128837585,
355
  "step": 115
356
  },
357
  {
358
- "epoch": 0.8807339449541285,
359
- "grad_norm": 13.077979257118965,
360
- "learning_rate": 2.980769230769231e-07,
361
- "logits/chosen": -1.856054663658142,
362
- "logits/rejected": -1.794531226158142,
363
- "logps/chosen": -151.5437469482422,
364
- "logps/rejected": -196.953125,
365
- "loss": 0.3459,
366
- "rewards/accuracies": 0.862500011920929,
367
- "rewards/chosen": 1.229943871498108,
368
- "rewards/margins": 2.440661668777466,
369
- "rewards/rejected": -1.2105392217636108,
370
  "step": 120
371
  },
372
  {
373
- "epoch": 0.9174311926605505,
374
- "grad_norm": 30.51657692857472,
375
- "learning_rate": 2.884615384615384e-07,
376
- "logits/chosen": -1.847265601158142,
377
- "logits/rejected": -1.7861328125,
378
- "logps/chosen": -173.4406280517578,
379
- "logps/rejected": -255.5749969482422,
380
- "loss": 0.4275,
381
- "rewards/accuracies": 0.800000011920929,
382
- "rewards/chosen": 0.864672839641571,
383
- "rewards/margins": 1.76495361328125,
384
- "rewards/rejected": -0.89959716796875,
385
  "step": 125
386
  },
387
  {
388
- "epoch": 0.9541284403669725,
389
- "grad_norm": 13.780160219092938,
390
- "learning_rate": 2.7884615384615384e-07,
391
- "logits/chosen": -1.877343773841858,
392
- "logits/rejected": -1.738671898841858,
393
- "logps/chosen": -214.515625,
394
- "logps/rejected": -354.0171813964844,
395
- "loss": 0.3844,
396
- "rewards/accuracies": 0.8374999761581421,
397
- "rewards/chosen": 0.858630359172821,
398
- "rewards/margins": 2.1751465797424316,
399
- "rewards/rejected": -1.316162109375,
400
  "step": 130
401
  },
402
  {
403
- "epoch": 0.9908256880733946,
404
- "grad_norm": 27.1013043867592,
405
- "learning_rate": 2.692307692307692e-07,
406
- "logits/chosen": -1.806054711341858,
407
- "logits/rejected": -1.7755858898162842,
408
- "logps/chosen": -256.57501220703125,
409
- "logps/rejected": -198.11874389648438,
410
- "loss": 0.4336,
411
- "rewards/accuracies": 0.831250011920929,
412
- "rewards/chosen": 0.836621105670929,
413
- "rewards/margins": 1.724884033203125,
414
- "rewards/rejected": -0.8876087069511414,
415
  "step": 135
416
  },
417
  {
418
- "epoch": 1.0220183486238532,
419
- "grad_norm": 13.031886237591982,
420
- "learning_rate": 2.596153846153846e-07,
421
- "logits/chosen": -1.8766084909439087,
422
- "logits/rejected": -1.7973345518112183,
423
- "logps/chosen": -228.58456420898438,
424
- "logps/rejected": -230.00735473632812,
425
- "loss": 0.4131,
426
- "rewards/accuracies": 0.8602941036224365,
427
- "rewards/chosen": 0.6476907134056091,
428
- "rewards/margins": 1.6508358716964722,
429
- "rewards/rejected": -1.0023910999298096,
430
  "step": 140
431
  },
432
  {
433
- "epoch": 1.0587155963302752,
434
- "grad_norm": 7.580044936631597,
435
- "learning_rate": 2.5e-07,
436
- "logits/chosen": -1.8037109375,
437
- "logits/rejected": -1.7208983898162842,
438
- "logps/chosen": -170.09375,
439
- "logps/rejected": -344.7437438964844,
440
- "loss": 0.4014,
441
- "rewards/accuracies": 0.8062499761581421,
442
- "rewards/chosen": 1.0894775390625,
443
- "rewards/margins": 1.9444580078125,
444
- "rewards/rejected": -0.85498046875,
445
  "step": 145
446
  },
447
  {
448
- "epoch": 1.0954128440366973,
449
- "grad_norm": 15.690513831070637,
450
- "learning_rate": 2.4038461538461537e-07,
451
- "logits/chosen": -1.822656273841858,
452
- "logits/rejected": -1.779296875,
453
- "logps/chosen": -248.88125610351562,
454
- "logps/rejected": -325.3500061035156,
455
- "loss": 0.3528,
456
- "rewards/accuracies": 0.824999988079071,
457
- "rewards/chosen": 0.877734363079071,
458
- "rewards/margins": 2.3188233375549316,
459
- "rewards/rejected": -1.4415161609649658,
460
  "step": 150
461
  },
462
  {
463
- "epoch": 1.1321100917431193,
464
- "grad_norm": 15.215220163021907,
465
- "learning_rate": 2.3076923076923078e-07,
466
- "logits/chosen": -1.85546875,
467
- "logits/rejected": -1.762109398841858,
468
- "logps/chosen": -178.05313110351562,
469
- "logps/rejected": -250.49374389648438,
470
- "loss": 0.4006,
471
- "rewards/accuracies": 0.800000011920929,
472
- "rewards/chosen": 0.9465576410293579,
473
- "rewards/margins": 1.915624976158142,
474
- "rewards/rejected": -0.970104992389679,
475
  "step": 155
476
  },
477
  {
478
- "epoch": 1.1688073394495413,
479
- "grad_norm": 18.54494454979772,
480
- "learning_rate": 2.2115384615384614e-07,
481
- "logits/chosen": -1.858007788658142,
482
- "logits/rejected": -1.782812476158142,
483
- "logps/chosen": -179.8249969482422,
484
- "logps/rejected": -342.28125,
485
- "loss": 0.378,
486
- "rewards/accuracies": 0.8062499761581421,
487
- "rewards/chosen": 0.7204223871231079,
488
- "rewards/margins": 2.033404588699341,
489
- "rewards/rejected": -1.3119995594024658,
490
  "step": 160
491
  },
492
  {
493
- "epoch": 1.2055045871559633,
494
- "grad_norm": 11.492838106006257,
495
- "learning_rate": 2.1153846153846152e-07,
496
- "logits/chosen": -1.938867211341858,
497
- "logits/rejected": -1.78515625,
498
- "logps/chosen": -210.4499969482422,
499
- "logps/rejected": -346.09375,
500
- "loss": 0.3104,
501
- "rewards/accuracies": 0.8687499761581421,
502
- "rewards/chosen": 0.9412597417831421,
503
- "rewards/margins": 2.375317335128784,
504
- "rewards/rejected": -1.43389892578125,
505
  "step": 165
506
  },
507
  {
508
- "epoch": 1.2422018348623853,
509
- "grad_norm": 22.063054783212692,
510
- "learning_rate": 2.0192307692307693e-07,
511
- "logits/chosen": -1.927148461341858,
512
- "logits/rejected": -1.8445312976837158,
513
- "logps/chosen": -182.09375,
514
- "logps/rejected": -195.45938110351562,
515
- "loss": 0.3562,
516
- "rewards/accuracies": 0.8125,
517
- "rewards/chosen": 0.888519287109375,
518
- "rewards/margins": 2.130175828933716,
519
- "rewards/rejected": -1.2406799793243408,
520
  "step": 170
521
  },
522
  {
523
- "epoch": 1.2788990825688074,
524
- "grad_norm": 18.824995052782572,
525
- "learning_rate": 1.9230769230769231e-07,
526
- "logits/chosen": -1.906835913658142,
527
- "logits/rejected": -1.78125,
528
- "logps/chosen": -199.08438110351562,
529
- "logps/rejected": -278.6812438964844,
530
- "loss": 0.3636,
531
- "rewards/accuracies": 0.8187500238418579,
532
- "rewards/chosen": 0.8860107660293579,
533
- "rewards/margins": 2.08349609375,
534
- "rewards/rejected": -1.1968567371368408,
535
  "step": 175
536
  },
537
  {
538
- "epoch": 1.3155963302752294,
539
- "grad_norm": 5.927489409391164,
540
- "learning_rate": 1.8269230769230767e-07,
541
- "logits/chosen": -1.8244140148162842,
542
- "logits/rejected": -1.7685546875,
543
- "logps/chosen": -175.609375,
544
- "logps/rejected": -202.66250610351562,
545
- "loss": 0.352,
546
- "rewards/accuracies": 0.8374999761581421,
547
- "rewards/chosen": 0.9653075933456421,
548
- "rewards/margins": 1.90960693359375,
549
- "rewards/rejected": -0.943652331829071,
550
  "step": 180
551
  },
552
  {
553
- "epoch": 1.3522935779816514,
554
- "grad_norm": 9.770113429927585,
555
- "learning_rate": 1.7307692307692305e-07,
556
- "logits/chosen": -1.9005858898162842,
557
- "logits/rejected": -1.786523461341858,
558
- "logps/chosen": -218.546875,
559
- "logps/rejected": -410.6187438964844,
560
- "loss": 0.3399,
561
- "rewards/accuracies": 0.824999988079071,
562
- "rewards/chosen": 0.8787597417831421,
563
- "rewards/margins": 2.3956542015075684,
564
- "rewards/rejected": -1.516717553138733,
565
  "step": 185
566
  },
567
  {
568
- "epoch": 1.3889908256880734,
569
- "grad_norm": 17.54848745249605,
570
- "learning_rate": 1.6346153846153846e-07,
571
- "logits/chosen": -1.9091796875,
572
- "logits/rejected": -1.811914086341858,
573
- "logps/chosen": -160.11874389648438,
574
- "logps/rejected": -216.6921844482422,
575
- "loss": 0.3768,
576
- "rewards/accuracies": 0.8374999761581421,
577
- "rewards/chosen": 0.76153564453125,
578
- "rewards/margins": 2.0511474609375,
579
- "rewards/rejected": -1.288671851158142,
580
  "step": 190
581
  },
582
  {
583
- "epoch": 1.4256880733944954,
584
- "grad_norm": 10.046035163946694,
585
- "learning_rate": 1.5384615384615385e-07,
586
- "logits/chosen": -1.866601586341858,
587
- "logits/rejected": -1.762304663658142,
588
- "logps/chosen": -152.25625610351562,
589
- "logps/rejected": -212.5749969482422,
590
- "loss": 0.3387,
591
- "rewards/accuracies": 0.8500000238418579,
592
- "rewards/chosen": 0.7191314697265625,
593
- "rewards/margins": 2.0878663063049316,
594
- "rewards/rejected": -1.3691924810409546,
595
  "step": 195
596
  },
597
  {
598
- "epoch": 1.4623853211009175,
599
- "grad_norm": 24.64891684255534,
600
- "learning_rate": 1.442307692307692e-07,
601
- "logits/chosen": -1.852148413658142,
602
- "logits/rejected": -1.7707030773162842,
603
- "logps/chosen": -150.7468719482422,
604
- "logps/rejected": -293.0625,
605
- "loss": 0.3384,
606
- "rewards/accuracies": 0.8500000238418579,
607
- "rewards/chosen": 0.9279540777206421,
608
- "rewards/margins": 2.073071241378784,
609
- "rewards/rejected": -1.145782470703125,
610
  "step": 200
611
  },
612
  {
613
- "epoch": 1.4990825688073395,
614
- "grad_norm": 10.264189832377008,
615
- "learning_rate": 1.346153846153846e-07,
616
- "logits/chosen": -1.8664062023162842,
617
- "logits/rejected": -1.7138671875,
618
- "logps/chosen": -186.28750610351562,
619
- "logps/rejected": -292.2749938964844,
620
- "loss": 0.3129,
621
- "rewards/accuracies": 0.887499988079071,
622
- "rewards/chosen": 0.9490722417831421,
623
- "rewards/margins": 2.3033204078674316,
624
- "rewards/rejected": -1.3525269031524658,
625
  "step": 205
626
  },
627
  {
628
- "epoch": 1.5357798165137615,
629
- "grad_norm": 29.504068972664967,
630
- "learning_rate": 1.25e-07,
631
- "logits/chosen": -1.8708984851837158,
632
- "logits/rejected": -1.767578125,
633
- "logps/chosen": -234.1750030517578,
634
- "logps/rejected": -315.8374938964844,
635
- "loss": 0.3891,
636
- "rewards/accuracies": 0.8187500238418579,
637
- "rewards/chosen": 0.8542724847793579,
638
- "rewards/margins": 2.066845655441284,
639
- "rewards/rejected": -1.2125946283340454,
640
  "step": 210
641
  },
642
  {
643
- "epoch": 1.5724770642201835,
644
- "grad_norm": 11.711474643815764,
645
- "learning_rate": 1.1538461538461539e-07,
646
- "logits/chosen": -1.9169921875,
647
- "logits/rejected": -1.769921898841858,
648
- "logps/chosen": -176.625,
649
- "logps/rejected": -314.76873779296875,
650
- "loss": 0.3676,
651
- "rewards/accuracies": 0.800000011920929,
652
- "rewards/chosen": 1.0230834484100342,
653
- "rewards/margins": 2.096282958984375,
654
- "rewards/rejected": -1.0709412097930908,
655
  "step": 215
656
  },
657
  {
658
- "epoch": 1.6091743119266055,
659
- "grad_norm": 12.471272777348215,
660
- "learning_rate": 1.0576923076923076e-07,
661
- "logits/chosen": -1.8542969226837158,
662
- "logits/rejected": -1.773046851158142,
663
- "logps/chosen": -201.97811889648438,
664
- "logps/rejected": -332.16876220703125,
665
- "loss": 0.3545,
666
- "rewards/accuracies": 0.8374999761581421,
667
- "rewards/chosen": 0.979199230670929,
668
- "rewards/margins": 2.747509717941284,
669
- "rewards/rejected": -1.767333984375,
670
  "step": 220
671
  },
672
  {
673
- "epoch": 1.6458715596330276,
674
- "grad_norm": 14.651904183676013,
675
- "learning_rate": 9.615384615384616e-08,
676
- "logits/chosen": -1.8474609851837158,
677
- "logits/rejected": -1.7412109375,
678
- "logps/chosen": -167.25,
679
- "logps/rejected": -215.5812530517578,
680
- "loss": 0.3734,
681
- "rewards/accuracies": 0.800000011920929,
682
- "rewards/chosen": 0.9526001214981079,
683
- "rewards/margins": 1.8461425304412842,
684
- "rewards/rejected": -0.8924926519393921,
685
  "step": 225
686
  },
687
  {
688
- "epoch": 1.6825688073394496,
689
- "grad_norm": 14.445950528315446,
690
- "learning_rate": 8.653846153846153e-08,
691
- "logits/chosen": -1.88720703125,
692
- "logits/rejected": -1.7849609851837158,
693
- "logps/chosen": -163.72811889648438,
694
- "logps/rejected": -273.8656311035156,
695
- "loss": 0.3905,
696
- "rewards/accuracies": 0.8500000238418579,
697
- "rewards/chosen": 0.879486083984375,
698
- "rewards/margins": 2.225268602371216,
699
- "rewards/rejected": -1.3448257446289062,
700
  "step": 230
701
  },
702
  {
703
- "epoch": 1.7192660550458716,
704
- "grad_norm": 11.945778792987594,
705
- "learning_rate": 7.692307692307692e-08,
706
- "logits/chosen": -1.954492211341858,
707
- "logits/rejected": -1.8380858898162842,
708
- "logps/chosen": -196.75,
709
- "logps/rejected": -289.890625,
710
- "loss": 0.333,
711
- "rewards/accuracies": 0.8374999761581421,
712
- "rewards/chosen": 0.77606201171875,
713
- "rewards/margins": 2.102343797683716,
714
- "rewards/rejected": -1.3264648914337158,
715
  "step": 235
716
  },
717
  {
718
- "epoch": 1.7559633027522936,
719
- "grad_norm": 15.570589391253572,
720
- "learning_rate": 6.73076923076923e-08,
721
- "logits/chosen": -1.8849608898162842,
722
- "logits/rejected": -1.7628905773162842,
723
- "logps/chosen": -175.00625610351562,
724
- "logps/rejected": -219.3249969482422,
725
- "loss": 0.327,
726
  "rewards/accuracies": 0.8812500238418579,
727
- "rewards/chosen": 0.804931640625,
728
- "rewards/margins": 2.1232666969299316,
729
- "rewards/rejected": -1.3185760974884033,
730
  "step": 240
731
  },
732
  {
733
- "epoch": 1.7926605504587156,
734
- "grad_norm": 29.853484257932976,
735
- "learning_rate": 5.7692307692307695e-08,
736
- "logits/chosen": -1.826757788658142,
737
- "logits/rejected": -1.7771484851837158,
738
- "logps/chosen": -270.60467529296875,
739
- "logps/rejected": -215.1843719482422,
740
- "loss": 0.3607,
741
  "rewards/accuracies": 0.8187500238418579,
742
- "rewards/chosen": 0.9619140625,
743
- "rewards/margins": 2.0130372047424316,
744
- "rewards/rejected": -1.0504271984100342,
745
  "step": 245
746
  },
747
  {
748
- "epoch": 1.8293577981651377,
749
- "grad_norm": 18.077930531222716,
750
- "learning_rate": 4.807692307692308e-08,
751
- "logits/chosen": -1.881250023841858,
752
- "logits/rejected": -1.826562523841858,
753
- "logps/chosen": -196.35311889648438,
754
- "logps/rejected": -277.0562438964844,
755
- "loss": 0.3781,
756
- "rewards/accuracies": 0.831250011920929,
757
- "rewards/chosen": 0.970629870891571,
758
- "rewards/margins": 2.069580078125,
759
- "rewards/rejected": -1.1002318859100342,
760
  "step": 250
761
  },
762
  {
763
- "epoch": 1.8660550458715597,
764
- "grad_norm": 13.62739433233727,
765
- "learning_rate": 3.846153846153846e-08,
766
- "logits/chosen": -1.826757788658142,
767
- "logits/rejected": -1.7664062976837158,
768
- "logps/chosen": -138.7578125,
769
- "logps/rejected": -210.30313110351562,
770
- "loss": 0.4098,
771
- "rewards/accuracies": 0.8187500238418579,
772
- "rewards/chosen": 0.861804187297821,
773
- "rewards/margins": 1.5693480968475342,
774
- "rewards/rejected": -0.706738293170929,
775
  "step": 255
776
  },
777
  {
778
- "epoch": 1.9027522935779817,
779
- "grad_norm": 12.688517830934348,
780
- "learning_rate": 2.8846153846153848e-08,
781
- "logits/chosen": -1.883203148841858,
782
- "logits/rejected": -1.759765625,
783
- "logps/chosen": -167.63125610351562,
784
- "logps/rejected": -209.3625030517578,
785
- "loss": 0.3561,
786
- "rewards/accuracies": 0.8500000238418579,
787
- "rewards/chosen": 0.8629516363143921,
788
- "rewards/margins": 1.875878930091858,
789
- "rewards/rejected": -1.0120147466659546,
790
  "step": 260
791
  },
792
  {
793
- "epoch": 1.9394495412844037,
794
- "grad_norm": 45.414007378020266,
795
- "learning_rate": 1.923076923076923e-08,
796
- "logits/chosen": -1.849023461341858,
797
- "logits/rejected": -1.7927734851837158,
798
- "logps/chosen": -339.13751220703125,
799
- "logps/rejected": -384.7906188964844,
800
- "loss": 0.4754,
801
- "rewards/accuracies": 0.7749999761581421,
802
- "rewards/chosen": 0.6140381097793579,
803
- "rewards/margins": 2.0094971656799316,
804
- "rewards/rejected": -1.395416259765625,
805
  "step": 265
806
  },
807
  {
808
- "epoch": 1.9761467889908257,
809
- "grad_norm": 33.400762277633376,
810
- "learning_rate": 9.615384615384615e-09,
811
- "logits/chosen": -1.873437523841858,
812
- "logits/rejected": -1.7628905773162842,
813
- "logps/chosen": -183.55624389648438,
814
- "logps/rejected": -175.9093780517578,
815
- "loss": 0.4154,
816
- "rewards/accuracies": 0.8812500238418579,
817
- "rewards/chosen": 0.709582507610321,
818
- "rewards/margins": 1.754003882408142,
819
- "rewards/rejected": -1.044396996498108,
820
  "step": 270
821
  },
822
  {
823
  "epoch": 2.0,
824
- "step": 274,
825
  "total_flos": 0.0,
826
- "train_loss": 0.4061378458120527,
827
- "train_runtime": 4146.7376,
828
- "train_samples_per_second": 2.102,
829
- "train_steps_per_second": 0.066
830
  }
831
  ],
832
  "logging_steps": 5,
833
- "max_steps": 274,
834
  "num_input_tokens_seen": 0,
835
  "num_train_epochs": 2,
836
  "save_steps": 50,
 
4
  "best_model_checkpoint": null,
5
  "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 326,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.03069838833461243,
14
+ "grad_norm": 11.121566696073334,
15
+ "learning_rate": 1.176470588235294e-07,
16
+ "logits/chosen": -2.22265625,
17
+ "logits/rejected": -2.1064453125,
18
+ "logps/chosen": -227.35311889648438,
19
+ "logps/rejected": -388.5625,
20
+ "loss": 0.3583,
21
+ "rewards/accuracies": 0.793749988079071,
22
+ "rewards/chosen": 1.9202148914337158,
23
+ "rewards/margins": 3.3916258811950684,
24
+ "rewards/rejected": -1.4735596179962158,
25
  "step": 5
26
  },
27
  {
28
+ "epoch": 0.06139677666922486,
29
+ "grad_norm": 20.63353450702747,
30
+ "learning_rate": 2.6470588235294114e-07,
31
+ "logits/chosen": -2.253124952316284,
32
+ "logits/rejected": -2.0814452171325684,
33
+ "logps/chosen": -248.7624969482422,
34
+ "logps/rejected": -484.30938720703125,
35
+ "loss": 0.3918,
36
+ "rewards/accuracies": 0.824999988079071,
37
+ "rewards/chosen": 2.1537842750549316,
38
+ "rewards/margins": 4.004614353179932,
39
+ "rewards/rejected": -1.8481566905975342,
40
  "step": 10
41
  },
42
  {
43
+ "epoch": 0.0920951650038373,
44
+ "grad_norm": 16.123416834236878,
45
+ "learning_rate": 4.117647058823529e-07,
46
+ "logits/chosen": -2.2552733421325684,
47
+ "logits/rejected": -2.0552735328674316,
48
+ "logps/chosen": -299.7796936035156,
49
+ "logps/rejected": -378.203125,
50
+ "loss": 0.3305,
51
+ "rewards/accuracies": 0.8125,
52
+ "rewards/chosen": 2.6285157203674316,
53
+ "rewards/margins": 3.8902344703674316,
54
+ "rewards/rejected": -1.26220703125,
55
  "step": 15
56
  },
57
  {
58
+ "epoch": 0.12279355333844973,
59
+ "grad_norm": 12.29166671815314,
60
+ "learning_rate": 4.967637540453074e-07,
61
+ "logits/chosen": -2.2583985328674316,
62
+ "logits/rejected": -2.017773389816284,
63
+ "logps/chosen": -316.33123779296875,
64
+ "logps/rejected": -459.39373779296875,
65
+ "loss": 0.294,
66
+ "rewards/accuracies": 0.8374999761581421,
67
+ "rewards/chosen": 2.122448682785034,
68
+ "rewards/margins": 4.082617282867432,
69
+ "rewards/rejected": -1.956140160560608,
70
  "step": 20
71
  },
72
  {
73
+ "epoch": 0.15349194167306215,
74
+ "grad_norm": 17.064517327038068,
75
+ "learning_rate": 4.886731391585761e-07,
76
+ "logits/chosen": -2.290234327316284,
77
+ "logits/rejected": -1.9728515148162842,
78
+ "logps/chosen": -361.1000061035156,
79
+ "logps/rejected": -554.6124877929688,
80
+ "loss": 0.2684,
81
+ "rewards/accuracies": 0.862500011920929,
82
+ "rewards/chosen": 2.3677735328674316,
83
+ "rewards/margins": 4.116113185882568,
84
+ "rewards/rejected": -1.744531273841858,
85
  "step": 25
86
  },
87
  {
88
+ "epoch": 0.1841903300076746,
89
+ "grad_norm": 11.397428856197525,
90
+ "learning_rate": 4.805825242718447e-07,
91
+ "logits/chosen": -2.171679735183716,
92
+ "logits/rejected": -1.9724609851837158,
93
+ "logps/chosen": -302.3374938964844,
94
+ "logps/rejected": -366.83123779296875,
95
+ "loss": 0.3434,
96
+ "rewards/accuracies": 0.8687499761581421,
97
+ "rewards/chosen": 1.95025634765625,
98
+ "rewards/margins": 3.369799852371216,
99
+ "rewards/rejected": -1.4193115234375,
100
  "step": 30
101
  },
102
  {
103
+ "epoch": 0.21488871834228704,
104
+ "grad_norm": 17.965759727474417,
105
+ "learning_rate": 4.724919093851132e-07,
106
+ "logits/chosen": -2.247851610183716,
107
+ "logits/rejected": -2.120898485183716,
108
+ "logps/chosen": -242.43594360351562,
109
+ "logps/rejected": -474.0562438964844,
110
+ "loss": 0.291,
111
+ "rewards/accuracies": 0.856249988079071,
112
+ "rewards/chosen": 2.177441358566284,
113
+ "rewards/margins": 3.690624952316284,
114
+ "rewards/rejected": -1.5150024890899658,
115
  "step": 35
116
  },
117
  {
118
+ "epoch": 0.24558710667689945,
119
+ "grad_norm": 16.842023758004515,
120
+ "learning_rate": 4.6440129449838184e-07,
121
+ "logits/chosen": -2.28515625,
122
+ "logits/rejected": -2.058789014816284,
123
+ "logps/chosen": -255.53125,
124
+ "logps/rejected": -484.47186279296875,
125
+ "loss": 0.3384,
126
+ "rewards/accuracies": 0.800000011920929,
127
+ "rewards/chosen": 2.153881788253784,
128
+ "rewards/margins": 3.767773389816284,
129
+ "rewards/rejected": -1.615258812904358,
130
  "step": 40
131
  },
132
  {
133
+ "epoch": 0.2762854950115119,
134
+ "grad_norm": 22.9317252710755,
135
+ "learning_rate": 4.563106796116505e-07,
136
+ "logits/chosen": -2.24609375,
137
+ "logits/rejected": -2.001171827316284,
138
+ "logps/chosen": -327.17498779296875,
139
+ "logps/rejected": -405.9375,
140
+ "loss": 0.2816,
141
+ "rewards/accuracies": 0.84375,
142
+ "rewards/chosen": 2.350268602371216,
143
+ "rewards/margins": 4.024218559265137,
144
+ "rewards/rejected": -1.6744873523712158,
145
  "step": 45
146
  },
147
  {
148
+ "epoch": 0.3069838833461243,
149
+ "grad_norm": 15.527675742487922,
150
+ "learning_rate": 4.4822006472491906e-07,
151
+ "logits/chosen": -2.308789014816284,
152
+ "logits/rejected": -2.0882811546325684,
153
+ "logps/chosen": -244.3156280517578,
154
+ "logps/rejected": -342.50311279296875,
155
+ "loss": 0.3627,
156
+ "rewards/accuracies": 0.8187500238418579,
157
+ "rewards/chosen": 2.4039337635040283,
158
+ "rewards/margins": 3.5143370628356934,
159
+ "rewards/rejected": -1.1082031726837158,
160
  "step": 50
161
  },
162
  {
163
+ "epoch": 0.3376822716807368,
164
+ "grad_norm": 21.127864486539234,
165
+ "learning_rate": 4.4012944983818767e-07,
166
+ "logits/chosen": -2.2890625,
167
+ "logits/rejected": -2.038281202316284,
168
+ "logps/chosen": -308.95001220703125,
169
+ "logps/rejected": -414.5375061035156,
170
+ "loss": 0.2959,
171
+ "rewards/accuracies": 0.893750011920929,
172
+ "rewards/chosen": 2.238818407058716,
173
+ "rewards/margins": 3.924755811691284,
174
+ "rewards/rejected": -1.6855957508087158,
175
  "step": 55
176
  },
177
  {
178
+ "epoch": 0.3683806600153492,
179
+ "grad_norm": 52.75258924580777,
180
+ "learning_rate": 4.320388349514563e-07,
181
+ "logits/chosen": -2.2689452171325684,
182
+ "logits/rejected": -2.0503907203674316,
183
+ "logps/chosen": -264.5375061035156,
184
+ "logps/rejected": -451.8687438964844,
185
+ "loss": 0.3218,
186
+ "rewards/accuracies": 0.84375,
187
+ "rewards/chosen": 1.5753967761993408,
188
+ "rewards/margins": 4.02734375,
189
+ "rewards/rejected": -2.451855421066284,
190
  "step": 60
191
  },
192
  {
193
+ "epoch": 0.3990790483499616,
194
+ "grad_norm": 20.196062332868237,
195
+ "learning_rate": 4.239482200647249e-07,
196
+ "logits/chosen": -2.2076172828674316,
197
+ "logits/rejected": -1.9767577648162842,
198
+ "logps/chosen": -292.6937561035156,
199
+ "logps/rejected": -459.34375,
200
+ "loss": 0.3184,
201
+ "rewards/accuracies": 0.875,
202
+ "rewards/chosen": 2.086029052734375,
203
+ "rewards/margins": 3.979687452316284,
204
+ "rewards/rejected": -1.89422607421875,
205
  "step": 65
206
  },
207
  {
208
+ "epoch": 0.4297774366845741,
209
+ "grad_norm": 20.093203861551977,
210
+ "learning_rate": 4.158576051779935e-07,
211
+ "logits/chosen": -2.251757860183716,
212
+ "logits/rejected": -2.056640625,
213
+ "logps/chosen": -290.328125,
214
+ "logps/rejected": -389.9156188964844,
215
+ "loss": 0.2854,
216
+ "rewards/accuracies": 0.8812500238418579,
217
+ "rewards/chosen": 2.213456630706787,
218
+ "rewards/margins": 3.6863036155700684,
219
+ "rewards/rejected": -1.474511742591858,
220
  "step": 70
221
  },
222
  {
223
+ "epoch": 0.4604758250191865,
224
+ "grad_norm": 19.79634574619263,
225
+ "learning_rate": 4.077669902912621e-07,
226
+ "logits/chosen": -2.2406249046325684,
227
+ "logits/rejected": -2.00390625,
228
+ "logps/chosen": -293.62811279296875,
229
+ "logps/rejected": -435.84375,
230
+ "loss": 0.2902,
231
+ "rewards/accuracies": 0.84375,
232
+ "rewards/chosen": 1.892968773841858,
233
+ "rewards/margins": 3.774218797683716,
234
+ "rewards/rejected": -1.883551001548767,
235
  "step": 75
236
  },
237
  {
238
+ "epoch": 0.4911742133537989,
239
+ "grad_norm": 19.181029218450373,
240
+ "learning_rate": 3.9967637540453073e-07,
241
+ "logits/chosen": -2.2816405296325684,
242
+ "logits/rejected": -2.0777344703674316,
243
+ "logps/chosen": -301.49688720703125,
244
+ "logps/rejected": -399.69061279296875,
245
+ "loss": 0.2902,
246
+ "rewards/accuracies": 0.8812500238418579,
247
+ "rewards/chosen": 2.0974364280700684,
248
+ "rewards/margins": 3.5245604515075684,
249
+ "rewards/rejected": -1.4281127452850342,
250
  "step": 80
251
  },
252
  {
253
+ "epoch": 0.5218726016884113,
254
+ "grad_norm": 14.675252239263,
255
+ "learning_rate": 3.9158576051779934e-07,
256
+ "logits/chosen": -2.2740235328674316,
257
+ "logits/rejected": -2.0689454078674316,
258
+ "logps/chosen": -294.4234313964844,
259
+ "logps/rejected": -564.1343994140625,
260
+ "loss": 0.2916,
261
+ "rewards/accuracies": 0.8687499761581421,
262
+ "rewards/chosen": 1.751062035560608,
263
+ "rewards/margins": 3.894702196121216,
264
+ "rewards/rejected": -2.143725633621216,
265
  "step": 85
266
  },
267
  {
268
+ "epoch": 0.5525709900230238,
269
+ "grad_norm": 23.299079990362248,
270
+ "learning_rate": 3.8349514563106795e-07,
271
+ "logits/chosen": -2.2728514671325684,
272
+ "logits/rejected": -2.0404295921325684,
273
+ "logps/chosen": -317.50311279296875,
274
+ "logps/rejected": -462.9156188964844,
275
+ "loss": 0.2674,
276
+ "rewards/accuracies": 0.8999999761581421,
277
+ "rewards/chosen": 2.2327880859375,
278
+ "rewards/margins": 3.934765577316284,
279
+ "rewards/rejected": -1.7007324695587158,
280
  "step": 90
281
  },
282
  {
283
+ "epoch": 0.5832693783576363,
284
+ "grad_norm": 55.83907641753374,
285
+ "learning_rate": 3.754045307443365e-07,
286
+ "logits/chosen": -2.2884764671325684,
287
+ "logits/rejected": -2.100390672683716,
288
+ "logps/chosen": -415.45623779296875,
289
+ "logps/rejected": -376.41876220703125,
290
+ "loss": 0.3104,
291
+ "rewards/accuracies": 0.90625,
292
+ "rewards/chosen": 2.108752489089966,
293
+ "rewards/margins": 4.00244140625,
294
+ "rewards/rejected": -1.894140601158142,
295
  "step": 95
296
  },
297
  {
298
+ "epoch": 0.6139677666922486,
299
+ "grad_norm": 30.671605767336874,
300
+ "learning_rate": 3.673139158576052e-07,
301
+ "logits/chosen": -2.287890672683716,
302
+ "logits/rejected": -2.02734375,
303
+ "logps/chosen": -316.25,
304
+ "logps/rejected": -460.70001220703125,
305
+ "loss": 0.2841,
306
+ "rewards/accuracies": 0.8687499761581421,
307
+ "rewards/chosen": 1.875756859779358,
308
+ "rewards/margins": 3.8394532203674316,
309
+ "rewards/rejected": -1.9620239734649658,
310
  "step": 100
311
  },
312
  {
313
+ "epoch": 0.6446661550268611,
314
+ "grad_norm": 20.82501262179368,
315
+ "learning_rate": 3.592233009708738e-07,
316
+ "logits/chosen": -2.256054639816284,
317
+ "logits/rejected": -2.04296875,
318
+ "logps/chosen": -324.45623779296875,
319
+ "logps/rejected": -504.0406188964844,
320
+ "loss": 0.2799,
321
  "rewards/accuracies": 0.856249988079071,
322
+ "rewards/chosen": 2.1114563941955566,
323
+ "rewards/margins": 4.019073486328125,
324
+ "rewards/rejected": -1.9062011241912842,
325
  "step": 105
326
  },
327
  {
328
+ "epoch": 0.6753645433614736,
329
+ "grad_norm": 33.80916604419537,
330
+ "learning_rate": 3.5113268608414234e-07,
331
+ "logits/chosen": -2.2220702171325684,
332
+ "logits/rejected": -2.0341796875,
333
+ "logps/chosen": -315.27032470703125,
334
+ "logps/rejected": -288.078125,
335
+ "loss": 0.2677,
336
+ "rewards/accuracies": 0.893750011920929,
337
+ "rewards/chosen": 2.1306395530700684,
338
+ "rewards/margins": 3.439257860183716,
339
+ "rewards/rejected": -1.3082396984100342,
340
  "step": 110
341
  },
342
  {
343
+ "epoch": 0.7060629316960859,
344
+ "grad_norm": 22.479271981046605,
345
+ "learning_rate": 3.4304207119741096e-07,
346
+ "logits/chosen": -2.2115235328674316,
347
+ "logits/rejected": -1.968359351158142,
348
+ "logps/chosen": -346.90625,
349
+ "logps/rejected": -690.7062377929688,
350
+ "loss": 0.3067,
351
+ "rewards/accuracies": 0.887499988079071,
352
+ "rewards/chosen": 1.858984351158142,
353
+ "rewards/margins": 4.519238471984863,
354
+ "rewards/rejected": -2.662036180496216,
355
  "step": 115
356
  },
357
  {
358
+ "epoch": 0.7367613200306984,
359
+ "grad_norm": 22.82747748987554,
360
+ "learning_rate": 3.349514563106796e-07,
361
+ "logits/chosen": -2.2685546875,
362
+ "logits/rejected": -2.046093702316284,
363
+ "logps/chosen": -322.59375,
364
+ "logps/rejected": -554.1500244140625,
365
+ "loss": 0.3413,
366
+ "rewards/accuracies": 0.8687499761581421,
367
+ "rewards/chosen": 1.87310791015625,
368
+ "rewards/margins": 4.309618949890137,
369
+ "rewards/rejected": -2.433239698410034,
370
  "step": 120
371
  },
372
  {
373
+ "epoch": 0.7674597083653109,
374
+ "grad_norm": 22.397741857744723,
375
+ "learning_rate": 3.2686084142394823e-07,
376
+ "logits/chosen": -2.2513670921325684,
377
+ "logits/rejected": -2.0927734375,
378
+ "logps/chosen": -302.29998779296875,
379
+ "logps/rejected": -564.328125,
380
+ "loss": 0.2886,
381
+ "rewards/accuracies": 0.856249988079071,
382
+ "rewards/chosen": 2.023284912109375,
383
+ "rewards/margins": 3.8551268577575684,
384
+ "rewards/rejected": -1.830969214439392,
385
  "step": 125
386
  },
387
  {
388
+ "epoch": 0.7981580966999232,
389
+ "grad_norm": 8.578228102008167,
390
+ "learning_rate": 3.187702265372168e-07,
391
+ "logits/chosen": -2.2466797828674316,
392
+ "logits/rejected": -2.034374952316284,
393
+ "logps/chosen": -324.86248779296875,
394
+ "logps/rejected": -450.6187438964844,
395
+ "loss": 0.2982,
396
+ "rewards/accuracies": 0.8999999761581421,
397
+ "rewards/chosen": 2.017773389816284,
398
+ "rewards/margins": 4.181738376617432,
399
+ "rewards/rejected": -2.1627564430236816,
400
  "step": 130
401
  },
402
  {
403
+ "epoch": 0.8288564850345357,
404
+ "grad_norm": 9.589882721872858,
405
+ "learning_rate": 3.106796116504854e-07,
406
+ "logits/chosen": -2.204882860183716,
407
+ "logits/rejected": -2.0736327171325684,
408
+ "logps/chosen": -283.6703186035156,
409
+ "logps/rejected": -438.23126220703125,
410
+ "loss": 0.3102,
411
+ "rewards/accuracies": 0.856249988079071,
412
+ "rewards/chosen": 1.94146728515625,
413
+ "rewards/margins": 3.675122022628784,
414
+ "rewards/rejected": -1.735009789466858,
415
  "step": 135
416
  },
417
  {
418
+ "epoch": 0.8595548733691482,
419
+ "grad_norm": 26.716783170877356,
420
+ "learning_rate": 3.0258899676375407e-07,
421
+ "logits/chosen": -2.226757764816284,
422
+ "logits/rejected": -2.0386719703674316,
423
+ "logps/chosen": -330.4624938964844,
424
+ "logps/rejected": -520.3562622070312,
425
+ "loss": 0.2945,
426
+ "rewards/accuracies": 0.9312499761581421,
427
+ "rewards/chosen": 1.6619141101837158,
428
+ "rewards/margins": 3.725292921066284,
429
+ "rewards/rejected": -2.062939405441284,
430
  "step": 140
431
  },
432
  {
433
+ "epoch": 0.8902532617037605,
434
+ "grad_norm": 28.642400942966432,
435
+ "learning_rate": 2.944983818770226e-07,
436
+ "logits/chosen": -2.2412109375,
437
+ "logits/rejected": -2.065624952316284,
438
+ "logps/chosen": -288.29998779296875,
439
+ "logps/rejected": -337.26873779296875,
440
+ "loss": 0.3528,
441
+ "rewards/accuracies": 0.856249988079071,
442
+ "rewards/chosen": 1.7783691883087158,
443
+ "rewards/margins": 3.4496827125549316,
444
+ "rewards/rejected": -1.66796875,
445
  "step": 145
446
  },
447
  {
448
+ "epoch": 0.920951650038373,
449
+ "grad_norm": 20.19215767876997,
450
+ "learning_rate": 2.8640776699029124e-07,
451
+ "logits/chosen": -2.296093702316284,
452
+ "logits/rejected": -2.0794920921325684,
453
+ "logps/chosen": -275.4468688964844,
454
+ "logps/rejected": -425.65313720703125,
455
+ "loss": 0.2387,
456
+ "rewards/accuracies": 0.9125000238418579,
457
+ "rewards/chosen": 2.306689500808716,
458
+ "rewards/margins": 4.460400581359863,
459
+ "rewards/rejected": -2.156982421875,
460
  "step": 150
461
  },
462
  {
463
+ "epoch": 0.9516500383729855,
464
+ "grad_norm": 18.41603462189631,
465
+ "learning_rate": 2.783171521035599e-07,
466
+ "logits/chosen": -2.2021484375,
467
+ "logits/rejected": -2.01953125,
468
+ "logps/chosen": -431.5687561035156,
469
+ "logps/rejected": -383.07501220703125,
470
+ "loss": 0.3324,
471
+ "rewards/accuracies": 0.887499988079071,
472
+ "rewards/chosen": 2.0430665016174316,
473
+ "rewards/margins": 4.26806640625,
474
+ "rewards/rejected": -2.2254638671875,
475
  "step": 155
476
  },
477
  {
478
+ "epoch": 0.9823484267075978,
479
+ "grad_norm": 22.4727442260727,
480
+ "learning_rate": 2.7022653721682846e-07,
481
+ "logits/chosen": -2.190234422683716,
482
+ "logits/rejected": -2.022265672683716,
483
+ "logps/chosen": -301.4593811035156,
484
+ "logps/rejected": -401.4593811035156,
485
+ "loss": 0.2658,
486
+ "rewards/accuracies": 0.856249988079071,
487
+ "rewards/chosen": 2.146771192550659,
488
+ "rewards/margins": 4.336230278015137,
489
+ "rewards/rejected": -2.1904234886169434,
490
  "step": 160
491
  },
492
  {
493
+ "epoch": 1.012279355333845,
494
+ "grad_norm": 29.045746191502413,
495
+ "learning_rate": 2.6213592233009707e-07,
496
+ "logits/chosen": -2.2520031929016113,
497
+ "logits/rejected": -1.9837740659713745,
498
+ "logps/chosen": -268.5384521484375,
499
+ "logps/rejected": -455.69232177734375,
500
+ "loss": 0.2525,
501
+ "rewards/accuracies": 0.9081196188926697,
502
+ "rewards/chosen": 2.257117748260498,
503
+ "rewards/margins": 4.123847961425781,
504
+ "rewards/rejected": -1.8706743717193604,
505
  "step": 165
506
  },
507
  {
508
+ "epoch": 1.0429777436684573,
509
+ "grad_norm": 41.53269802898524,
510
+ "learning_rate": 2.540453074433657e-07,
511
+ "logits/chosen": -2.285937547683716,
512
+ "logits/rejected": -2.0611329078674316,
513
+ "logps/chosen": -335.42498779296875,
514
+ "logps/rejected": -666.0999755859375,
515
+ "loss": 0.2565,
516
+ "rewards/accuracies": 0.9125000238418579,
517
+ "rewards/chosen": 2.12225341796875,
518
+ "rewards/margins": 4.858691215515137,
519
+ "rewards/rejected": -2.73529052734375,
520
  "step": 170
521
  },
522
  {
523
+ "epoch": 1.0736761320030699,
524
+ "grad_norm": 5.165992871948705,
525
+ "learning_rate": 2.459546925566343e-07,
526
+ "logits/chosen": -2.252148389816284,
527
+ "logits/rejected": -2.0345702171325684,
528
+ "logps/chosen": -309.3500061035156,
529
+ "logps/rejected": -519.1593627929688,
530
+ "loss": 0.2073,
531
+ "rewards/accuracies": 0.9125000238418579,
532
+ "rewards/chosen": 2.195666551589966,
533
+ "rewards/margins": 4.653515815734863,
534
+ "rewards/rejected": -2.455151319503784,
535
  "step": 175
536
  },
537
  {
538
+ "epoch": 1.1043745203376822,
539
+ "grad_norm": 7.172517838598237,
540
+ "learning_rate": 2.378640776699029e-07,
541
+ "logits/chosen": -2.219921827316284,
542
+ "logits/rejected": -2.071093797683716,
543
+ "logps/chosen": -306.23126220703125,
544
+ "logps/rejected": -418.01251220703125,
545
+ "loss": 0.2665,
546
+ "rewards/accuracies": 0.925000011920929,
547
+ "rewards/chosen": 2.138867139816284,
548
+ "rewards/margins": 4.50830078125,
549
+ "rewards/rejected": -2.36920166015625,
550
  "step": 180
551
  },
552
  {
553
+ "epoch": 1.1350729086722948,
554
+ "grad_norm": 10.018114423357114,
555
+ "learning_rate": 2.297734627831715e-07,
556
+ "logits/chosen": -2.275195360183716,
557
+ "logits/rejected": -2.061328172683716,
558
+ "logps/chosen": -350.2749938964844,
559
+ "logps/rejected": -652.1687622070312,
560
+ "loss": 0.2009,
561
+ "rewards/accuracies": 0.918749988079071,
562
+ "rewards/chosen": 1.891626000404358,
563
+ "rewards/margins": 4.262890815734863,
564
+ "rewards/rejected": -2.3704590797424316,
565
  "step": 185
566
  },
567
  {
568
+ "epoch": 1.1657712970069072,
569
+ "grad_norm": 20.436381868896863,
570
+ "learning_rate": 2.2168284789644013e-07,
571
+ "logits/chosen": -2.26171875,
572
+ "logits/rejected": -2.015429735183716,
573
+ "logps/chosen": -372.1812438964844,
574
+ "logps/rejected": -343.4375,
575
+ "loss": 0.266,
576
+ "rewards/accuracies": 0.893750011920929,
577
+ "rewards/chosen": 1.8646728992462158,
578
+ "rewards/margins": 3.8271484375,
579
+ "rewards/rejected": -1.962133765220642,
580
  "step": 190
581
  },
582
  {
583
+ "epoch": 1.1964696853415195,
584
+ "grad_norm": 6.83979158792521,
585
+ "learning_rate": 2.1359223300970871e-07,
586
+ "logits/chosen": -2.2855467796325684,
587
+ "logits/rejected": -2.160351514816284,
588
+ "logps/chosen": -342.2406311035156,
589
+ "logps/rejected": -460.0093688964844,
590
+ "loss": 0.2465,
591
+ "rewards/accuracies": 0.9125000238418579,
592
+ "rewards/chosen": 1.532690405845642,
593
+ "rewards/margins": 3.607617139816284,
594
+ "rewards/rejected": -2.077587842941284,
595
  "step": 195
596
  },
597
  {
598
+ "epoch": 1.2271680736761321,
599
+ "grad_norm": 10.68098901348123,
600
+ "learning_rate": 2.0550161812297733e-07,
601
+ "logits/chosen": -2.2890625,
602
+ "logits/rejected": -2.1167969703674316,
603
+ "logps/chosen": -305.42657470703125,
604
+ "logps/rejected": -532.4812622070312,
605
+ "loss": 0.2422,
606
+ "rewards/accuracies": 0.9125000238418579,
607
+ "rewards/chosen": 1.8942139148712158,
608
+ "rewards/margins": 3.9365234375,
609
+ "rewards/rejected": -2.0408568382263184,
610
  "step": 200
611
  },
612
  {
613
+ "epoch": 1.2578664620107445,
614
+ "grad_norm": 17.918521637264806,
615
+ "learning_rate": 1.9741100323624594e-07,
616
+ "logits/chosen": -2.105273485183716,
617
+ "logits/rejected": -1.9345703125,
618
+ "logps/chosen": -315.2796936035156,
619
+ "logps/rejected": -413.49688720703125,
620
+ "loss": 0.2416,
621
+ "rewards/accuracies": 0.9125000238418579,
622
+ "rewards/chosen": 2.3636536598205566,
623
+ "rewards/margins": 4.360058784484863,
624
+ "rewards/rejected": -1.9951903820037842,
625
  "step": 205
626
  },
627
  {
628
+ "epoch": 1.2885648503453568,
629
+ "grad_norm": 11.29949994925298,
630
+ "learning_rate": 1.8932038834951455e-07,
631
+ "logits/chosen": -2.263867139816284,
632
+ "logits/rejected": -2.0765624046325684,
633
+ "logps/chosen": -325.53436279296875,
634
+ "logps/rejected": -471.0687561035156,
635
+ "loss": 0.2416,
636
+ "rewards/accuracies": 0.893750011920929,
637
+ "rewards/chosen": 2.2369384765625,
638
+ "rewards/margins": 4.525781154632568,
639
+ "rewards/rejected": -2.2864136695861816,
640
  "step": 210
641
  },
642
  {
643
+ "epoch": 1.3192632386799694,
644
+ "grad_norm": 21.00338870046118,
645
+ "learning_rate": 1.8122977346278319e-07,
646
+ "logits/chosen": -2.228515625,
647
+ "logits/rejected": -1.9753906726837158,
648
+ "logps/chosen": -522.5625,
649
+ "logps/rejected": -537.4312744140625,
650
+ "loss": 0.2119,
651
+ "rewards/accuracies": 0.925000011920929,
652
+ "rewards/chosen": 2.187304735183716,
653
+ "rewards/margins": 4.826367378234863,
654
+ "rewards/rejected": -2.637646436691284,
655
  "step": 215
656
  },
657
  {
658
+ "epoch": 1.3499616270145818,
659
+ "grad_norm": 12.767347147647387,
660
+ "learning_rate": 1.7313915857605177e-07,
661
+ "logits/chosen": -2.2425780296325684,
662
+ "logits/rejected": -2.064453125,
663
+ "logps/chosen": -268.79998779296875,
664
+ "logps/rejected": -459.0062561035156,
665
+ "loss": 0.2299,
666
+ "rewards/accuracies": 0.9125000238418579,
667
+ "rewards/chosen": 2.1002197265625,
668
+ "rewards/margins": 4.092089653015137,
669
+ "rewards/rejected": -1.9898560047149658,
670
  "step": 220
671
  },
672
  {
673
+ "epoch": 1.3806600153491941,
674
+ "grad_norm": 8.681233801902874,
675
+ "learning_rate": 1.6504854368932038e-07,
676
+ "logits/chosen": -2.3003907203674316,
677
+ "logits/rejected": -2.0892577171325684,
678
+ "logps/chosen": -285.6468811035156,
679
+ "logps/rejected": -460.37188720703125,
680
+ "loss": 0.2077,
681
+ "rewards/accuracies": 0.918749988079071,
682
+ "rewards/chosen": 2.2374267578125,
683
+ "rewards/margins": 4.366796970367432,
684
+ "rewards/rejected": -2.1262099742889404,
685
  "step": 225
686
  },
687
  {
688
+ "epoch": 1.4113584036838067,
689
+ "grad_norm": 10.770689073686126,
690
+ "learning_rate": 1.56957928802589e-07,
691
+ "logits/chosen": -2.287109375,
692
+ "logits/rejected": -2.0908203125,
693
+ "logps/chosen": -301.0249938964844,
694
+ "logps/rejected": -290.8500061035156,
695
+ "loss": 0.2641,
696
+ "rewards/accuracies": 0.893750011920929,
697
+ "rewards/chosen": 2.11370849609375,
698
+ "rewards/margins": 4.033984184265137,
699
+ "rewards/rejected": -1.9171874523162842,
700
  "step": 230
701
  },
702
  {
703
+ "epoch": 1.442056792018419,
704
+ "grad_norm": 16.68254445980484,
705
+ "learning_rate": 1.488673139158576e-07,
706
+ "logits/chosen": -2.2152342796325684,
707
+ "logits/rejected": -2.0638670921325684,
708
+ "logps/chosen": -313.85626220703125,
709
+ "logps/rejected": -359.36248779296875,
710
+ "loss": 0.2247,
711
+ "rewards/accuracies": 0.9437500238418579,
712
+ "rewards/chosen": 2.3463501930236816,
713
+ "rewards/margins": 4.303027153015137,
714
+ "rewards/rejected": -1.958032250404358,
715
  "step": 235
716
  },
717
  {
718
+ "epoch": 1.4727551803530314,
719
+ "grad_norm": 11.454199015230472,
720
+ "learning_rate": 1.407766990291262e-07,
721
+ "logits/chosen": -2.189453125,
722
+ "logits/rejected": -1.9638671875,
723
+ "logps/chosen": -293.0093688964844,
724
+ "logps/rejected": -451.98748779296875,
725
+ "loss": 0.2605,
726
  "rewards/accuracies": 0.8812500238418579,
727
+ "rewards/chosen": 2.22021484375,
728
+ "rewards/margins": 4.625683784484863,
729
+ "rewards/rejected": -2.405810594558716,
730
  "step": 240
731
  },
732
  {
733
+ "epoch": 1.503453568687644,
734
+ "grad_norm": 10.811336406980383,
735
+ "learning_rate": 1.3268608414239483e-07,
736
+ "logits/chosen": -2.2994141578674316,
737
+ "logits/rejected": -2.1328125,
738
+ "logps/chosen": -321.2124938964844,
739
+ "logps/rejected": -442.8812561035156,
740
+ "loss": 0.2724,
741
  "rewards/accuracies": 0.8187500238418579,
742
+ "rewards/chosen": 2.0484375953674316,
743
+ "rewards/margins": 4.695569038391113,
744
+ "rewards/rejected": -2.64874267578125,
745
  "step": 245
746
  },
747
  {
748
+ "epoch": 1.5341519570222564,
749
+ "grad_norm": 37.3725127429148,
750
+ "learning_rate": 1.2459546925566344e-07,
751
+ "logits/chosen": -2.2509765625,
752
+ "logits/rejected": -2.047656297683716,
753
+ "logps/chosen": -309.80938720703125,
754
+ "logps/rejected": -368.5718688964844,
755
+ "loss": 0.3105,
756
+ "rewards/accuracies": 0.862500011920929,
757
+ "rewards/chosen": 1.801855444908142,
758
+ "rewards/margins": 3.7202117443084717,
759
+ "rewards/rejected": -1.917608618736267,
760
  "step": 250
761
  },
762
  {
763
+ "epoch": 1.5648503453568687,
764
+ "grad_norm": 12.486658586886827,
765
+ "learning_rate": 1.1650485436893204e-07,
766
+ "logits/chosen": -2.1792969703674316,
767
+ "logits/rejected": -2.007031202316284,
768
+ "logps/chosen": -275.83282470703125,
769
+ "logps/rejected": -367.39373779296875,
770
+ "loss": 0.2105,
771
+ "rewards/accuracies": 0.90625,
772
+ "rewards/chosen": 2.1904053688049316,
773
+ "rewards/margins": 4.875097751617432,
774
+ "rewards/rejected": -2.684741258621216,
775
  "step": 255
776
  },
777
  {
778
+ "epoch": 1.5955487336914813,
779
+ "grad_norm": 8.933287783732904,
780
+ "learning_rate": 1.0841423948220065e-07,
781
+ "logits/chosen": -2.3095703125,
782
+ "logits/rejected": -2.070117235183716,
783
+ "logps/chosen": -303.9671936035156,
784
+ "logps/rejected": -401.88751220703125,
785
+ "loss": 0.2192,
786
+ "rewards/accuracies": 0.918749988079071,
787
+ "rewards/chosen": 2.139721632003784,
788
+ "rewards/margins": 4.379004001617432,
789
+ "rewards/rejected": -2.2349791526794434,
790
  "step": 260
791
  },
792
  {
793
+ "epoch": 1.6262471220260937,
794
+ "grad_norm": 12.093053772689885,
795
+ "learning_rate": 1.0032362459546925e-07,
796
+ "logits/chosen": -2.1998047828674316,
797
+ "logits/rejected": -1.9921875,
798
+ "logps/chosen": -253.14688110351562,
799
+ "logps/rejected": -324.20623779296875,
800
+ "loss": 0.235,
801
+ "rewards/accuracies": 0.8999999761581421,
802
+ "rewards/chosen": 1.9906494617462158,
803
+ "rewards/margins": 4.438378810882568,
804
+ "rewards/rejected": -2.449389696121216,
805
  "step": 265
806
  },
807
  {
808
+ "epoch": 1.656945510360706,
809
+ "grad_norm": 23.45873113976805,
810
+ "learning_rate": 9.223300970873786e-08,
811
+ "logits/chosen": -2.155468702316284,
812
+ "logits/rejected": -1.975000023841858,
813
+ "logps/chosen": -299.0093688964844,
814
+ "logps/rejected": -490.6187438964844,
815
+ "loss": 0.2641,
816
+ "rewards/accuracies": 0.8999999761581421,
817
+ "rewards/chosen": 2.007080078125,
818
+ "rewards/margins": 4.43603515625,
819
+ "rewards/rejected": -2.4329833984375,
820
  "step": 270
821
  },
822
+ {
823
+ "epoch": 1.6876438986953186,
824
+ "grad_norm": 23.008683354113384,
825
+ "learning_rate": 8.414239482200647e-08,
826
+ "logits/chosen": -2.315234422683716,
827
+ "logits/rejected": -2.075000047683716,
828
+ "logps/chosen": -286.8187561035156,
829
+ "logps/rejected": -487.60626220703125,
830
+ "loss": 0.2542,
831
+ "rewards/accuracies": 0.9312499761581421,
832
+ "rewards/chosen": 1.4550018310546875,
833
+ "rewards/margins": 4.000781059265137,
834
+ "rewards/rejected": -2.544116258621216,
835
+ "step": 275
836
+ },
837
+ {
838
+ "epoch": 1.718342287029931,
839
+ "grad_norm": 9.981625176059863,
840
+ "learning_rate": 7.605177993527507e-08,
841
+ "logits/chosen": -2.2710938453674316,
842
+ "logits/rejected": -1.9931640625,
843
+ "logps/chosen": -285.4375,
844
+ "logps/rejected": -408.98748779296875,
845
+ "loss": 0.2375,
846
+ "rewards/accuracies": 0.90625,
847
+ "rewards/chosen": 2.16796875,
848
+ "rewards/margins": 4.263037204742432,
849
+ "rewards/rejected": -2.098034620285034,
850
+ "step": 280
851
+ },
852
+ {
853
+ "epoch": 1.7490406753645433,
854
+ "grad_norm": 16.26545065787498,
855
+ "learning_rate": 6.796116504854368e-08,
856
+ "logits/chosen": -2.2212891578674316,
857
+ "logits/rejected": -2.076367139816284,
858
+ "logps/chosen": -258.25701904296875,
859
+ "logps/rejected": -434.1187438964844,
860
+ "loss": 0.2327,
861
+ "rewards/accuracies": 0.8999999761581421,
862
+ "rewards/chosen": 2.1130614280700684,
863
+ "rewards/margins": 4.233984470367432,
864
+ "rewards/rejected": -2.1209349632263184,
865
+ "step": 285
866
+ },
867
+ {
868
+ "epoch": 1.779739063699156,
869
+ "grad_norm": 27.371726210067475,
870
+ "learning_rate": 5.987055016181229e-08,
871
+ "logits/chosen": -2.241406202316284,
872
+ "logits/rejected": -2.0464844703674316,
873
+ "logps/chosen": -269.08123779296875,
874
+ "logps/rejected": -387.34375,
875
+ "loss": 0.1835,
876
+ "rewards/accuracies": 0.9375,
877
+ "rewards/chosen": 2.199023485183716,
878
+ "rewards/margins": 4.75390625,
879
+ "rewards/rejected": -2.554003953933716,
880
+ "step": 290
881
+ },
882
+ {
883
+ "epoch": 1.8104374520337683,
884
+ "grad_norm": 15.812358238339476,
885
+ "learning_rate": 5.1779935275080905e-08,
886
+ "logits/chosen": -2.2125000953674316,
887
+ "logits/rejected": -2.0335936546325684,
888
+ "logps/chosen": -239.88125610351562,
889
+ "logps/rejected": -359.8125,
890
+ "loss": 0.2308,
891
+ "rewards/accuracies": 0.8999999761581421,
892
+ "rewards/chosen": 1.8656494617462158,
893
+ "rewards/margins": 4.216113090515137,
894
+ "rewards/rejected": -2.351318359375,
895
+ "step": 295
896
+ },
897
+ {
898
+ "epoch": 1.8411358403683806,
899
+ "grad_norm": 7.50562504957695,
900
+ "learning_rate": 4.3689320388349516e-08,
901
+ "logits/chosen": -2.2437500953674316,
902
+ "logits/rejected": -1.9416015148162842,
903
+ "logps/chosen": -295.25311279296875,
904
+ "logps/rejected": -730.5250244140625,
905
+ "loss": 0.2539,
906
+ "rewards/accuracies": 0.90625,
907
+ "rewards/chosen": 2.0682129859924316,
908
+ "rewards/margins": 4.706347465515137,
909
+ "rewards/rejected": -2.636059522628784,
910
+ "step": 300
911
+ },
912
+ {
913
+ "epoch": 1.8718342287029932,
914
+ "grad_norm": 16.624759938399098,
915
+ "learning_rate": 3.559870550161812e-08,
916
+ "logits/chosen": -2.231250047683716,
917
+ "logits/rejected": -2.038867235183716,
918
+ "logps/chosen": -302.42498779296875,
919
+ "logps/rejected": -507.2562561035156,
920
+ "loss": 0.2846,
921
+ "rewards/accuracies": 0.9125000238418579,
922
+ "rewards/chosen": 1.7378661632537842,
923
+ "rewards/margins": 4.2021484375,
924
+ "rewards/rejected": -2.4639649391174316,
925
+ "step": 305
926
+ },
927
+ {
928
+ "epoch": 1.9025326170376056,
929
+ "grad_norm": 13.017575528579666,
930
+ "learning_rate": 2.750809061488673e-08,
931
+ "logits/chosen": -2.2279295921325684,
932
+ "logits/rejected": -1.9718749523162842,
933
+ "logps/chosen": -267.7437438964844,
934
+ "logps/rejected": -451.09844970703125,
935
+ "loss": 0.265,
936
+ "rewards/accuracies": 0.9125000238418579,
937
+ "rewards/chosen": 1.897705078125,
938
+ "rewards/margins": 4.704199314117432,
939
+ "rewards/rejected": -2.805896043777466,
940
+ "step": 310
941
+ },
942
+ {
943
+ "epoch": 1.933231005372218,
944
+ "grad_norm": 14.759142969719154,
945
+ "learning_rate": 1.9417475728155338e-08,
946
+ "logits/chosen": -2.226367235183716,
947
+ "logits/rejected": -2.0074219703674316,
948
+ "logps/chosen": -287.78125,
949
+ "logps/rejected": -434.41876220703125,
950
+ "loss": 0.2342,
951
+ "rewards/accuracies": 0.8812500238418579,
952
+ "rewards/chosen": 1.9349365234375,
953
+ "rewards/margins": 4.436230659484863,
954
+ "rewards/rejected": -2.49725341796875,
955
+ "step": 315
956
+ },
957
+ {
958
+ "epoch": 1.9639293937068305,
959
+ "grad_norm": 5.502511576422595,
960
+ "learning_rate": 1.1326860841423949e-08,
961
+ "logits/chosen": -2.2650389671325684,
962
+ "logits/rejected": -2.0667967796325684,
963
+ "logps/chosen": -285.3687438964844,
964
+ "logps/rejected": -342.23126220703125,
965
+ "loss": 0.1939,
966
+ "rewards/accuracies": 0.925000011920929,
967
+ "rewards/chosen": 2.207592725753784,
968
+ "rewards/margins": 4.591406345367432,
969
+ "rewards/rejected": -2.382006883621216,
970
+ "step": 320
971
+ },
972
+ {
973
+ "epoch": 1.9946277820414429,
974
+ "grad_norm": 22.06160763333907,
975
+ "learning_rate": 3.2362459546925565e-09,
976
+ "logits/chosen": -2.13671875,
977
+ "logits/rejected": -2.0425782203674316,
978
+ "logps/chosen": -295.52813720703125,
979
+ "logps/rejected": -390.890625,
980
+ "loss": 0.2874,
981
+ "rewards/accuracies": 0.8999999761581421,
982
+ "rewards/chosen": 1.939697265625,
983
+ "rewards/margins": 3.943554639816284,
984
+ "rewards/rejected": -2.001660108566284,
985
+ "step": 325
986
+ },
987
  {
988
  "epoch": 2.0,
989
+ "step": 326,
990
  "total_flos": 0.0,
991
+ "train_loss": 0.2737119815832267,
992
+ "train_runtime": 3862.0227,
993
+ "train_samples_per_second": 2.699,
994
+ "train_steps_per_second": 0.084
995
  }
996
  ],
997
  "logging_steps": 5,
998
+ "max_steps": 326,
999
  "num_input_tokens_seen": 0,
1000
  "num_train_epochs": 2,
1001
  "save_steps": 50,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f59e267c9f87c4b6f165fec3175a328a7f6358c64f7b7474d6d509f2f903bc12
3
  size 7800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5b27d9594e9d2be8e17f4cc2533dea3a857edbe23a5da550573190398ac4436
3
  size 7800