pszemraj committed · verified
Commit 78870ae · 0 Parent(s)

Super-squash branch 'main' using huggingface_hub
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,26 @@
+ ---
+ license: apache-2.0
+ base_model: Qwen/Qwen2-1.5B
+ metrics:
+ - accuracy
+ datasets:
+ - BEE-spoke-data/stepbasin-books
+ language:
+ - en
+ ---
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/pszemraj/long-generation-tests/runs/ethp25f9)
+ # Qwen2-1.5B-stepbasin-books
+
+ > [!IMPORTANT]
+ > This model was fine-tuned at a context length of 16,384 tokens.
+
+ This is an experiment in long-context text **generation** (i.e., 6k+ tokens generated) to evaluate if and when generation breaks down. As such, all of the data this model was fine-tuned on consists of full-length books.
+ ## Details
+
+ This model is a fine-tuned version of [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) on https://github.com/stepbasin/books/tree/master/books
+
+ It achieves the following results on the evaluation set:
+ - Loss: 2.8110
+ - Accuracy: 0.4298
+ - Num input tokens seen: 44,040,192
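For readers who want to try the checkpoint, here is a minimal inference sketch using `transformers`. The repo id `pszemraj/Qwen2-1.5B-stepbasin-books` is an assumption inferred from the committer and model name above, and the prompt is purely illustrative.

```python
# Minimal inference sketch; the repo id is assumed, adjust to the actual Hub path.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "pszemraj/Qwen2-1.5B-stepbasin-books"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"
)

prompt = "The old house at the end of the lane had been empty for years."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Because generation_config.json ships with the checkpoint, generate() picks up
# the sampling defaults (do_sample, repetition_penalty, etc.) automatically.
outputs = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```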
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
all_results.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "epoch": 2.9899888765294773,
+   "eval_accuracy": 0.4297979192055684,
+   "eval_loss": 2.810983180999756,
+   "eval_runtime": 19.3217,
+   "eval_samples": 29,
+   "eval_samples_per_second": 1.501,
+   "eval_steps_per_second": 1.501,
+   "num_input_tokens_seen": 44040192,
+   "perplexity": 16.62625680536472,
+   "total_flos": 3.462459117703004e+17,
+   "train_loss": 2.7097946518943425,
+   "train_runtime": 6795.1554,
+   "train_samples": 899,
+   "train_samples_per_second": 0.397,
+   "train_steps_per_second": 0.012
+ }
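The throughput implied by these figures is easy to sanity-check; a small sketch using only the values reported above:

```python
# Derive wall time and training throughput from the reported metrics.
tokens_seen = 44_040_192       # num_input_tokens_seen
train_runtime_s = 6795.1554    # train_runtime (seconds)

print(f"wall time:  {train_runtime_s / 3600:.2f} h")                 # ~1.89 h
print(f"throughput: {tokens_seen / train_runtime_s:,.0f} tokens/s")  # ~6,481 tokens/s
```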
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "Qwen/Qwen2-1.5B",
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "max_position_embeddings": 131072,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 1000000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.42.4",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
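A few of these fields pin down the attention layout: 12 query heads sharing 2 key/value heads is grouped-query attention (GQA), and the head dimension is `hidden_size / num_attention_heads = 128`. A quick sketch reading this back from the config (same assumed repo id as above):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("pszemraj/Qwen2-1.5B-stepbasin-books")  # assumed repo id
head_dim = cfg.hidden_size // cfg.num_attention_heads           # 1536 // 12 = 128
gqa_ratio = cfg.num_attention_heads // cfg.num_key_value_heads  # 12 // 2 = 6 query heads per KV head
print(head_dim, gqa_ratio)
```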
eval_results.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "epoch": 2.9899888765294773,
+   "eval_accuracy": 0.4297979192055684,
+   "eval_loss": 2.810983180999756,
+   "eval_runtime": 19.3217,
+   "eval_samples": 29,
+   "eval_samples_per_second": 1.501,
+   "eval_steps_per_second": 1.501,
+   "num_input_tokens_seen": 44040192,
+   "perplexity": 16.62625680536472
+ }
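The reported perplexity is simply the exponential of the evaluation loss, which a one-line check confirms:

```python
import math

eval_loss = 2.810983180999756
print(math.exp(eval_loss))  # ~16.6263, matching the reported perplexity
```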
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "max_new_tokens": 8192,
+   "do_sample": true,
+   "repetition_penalty": 1.04,
+   "renormalize_logits": true,
+   "no_repeat_ngram_size": 4,
+   "epsilon_cutoff": 1e-5
+ }
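These defaults are applied automatically whenever this file ships alongside the checkpoint, but they can also be constructed or overridden explicitly; a sketch of the equivalent `GenerationConfig` (model and input setup elided):

```python
from transformers import GenerationConfig

# Mirror of the defaults above; any field can be overridden per call.
gen_config = GenerationConfig(
    max_new_tokens=8192,
    do_sample=True,
    repetition_penalty=1.04,
    renormalize_logits=True,
    no_repeat_ngram_size=4,
    epsilon_cutoff=1e-5,
)
# outputs = model.generate(**inputs, generation_config=gen_config)
```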
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f0ee7d4553edd7103ba253a2eb1ac64e606af2fa72d726b8aaabd8461dd847b
+ size 3087467144
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
train_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "epoch": 2.9899888765294773,
+   "num_input_tokens_seen": 44040192,
+   "total_flos": 3.462459117703004e+17,
+   "train_loss": 2.7097946518943425,
+   "train_runtime": 6795.1554,
+   "train_samples": 899,
+   "train_samples_per_second": 0.397,
+   "train_steps_per_second": 0.012
+ }
trainer_state.json ADDED
@@ -0,0 +1,297 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.9899888765294773,
+   "eval_steps": 300.0,
+   "global_step": 84,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.1067853170189099,
+       "grad_norm": 0.5859375,
+       "learning_rate": 1.8e-05,
+       "loss": 2.8121,
+       "num_input_tokens_seen": 1572864,
+       "step": 3
+     },
+     {
+       "epoch": 0.2135706340378198,
+       "grad_norm": 0.69921875,
+       "learning_rate": 2.9988140958409528e-05,
+       "loss": 2.8211,
+       "num_input_tokens_seen": 3145728,
+       "step": 6
+     },
+     {
+       "epoch": 0.3203559510567297,
+       "grad_norm": 0.796875,
+       "learning_rate": 2.9810630129045003e-05,
+       "loss": 2.7286,
+       "num_input_tokens_seen": 4718592,
+       "step": 9
+     },
+     {
+       "epoch": 0.4271412680756396,
+       "grad_norm": 0.71875,
+       "learning_rate": 2.9422573564911305e-05,
+       "loss": 2.7688,
+       "num_input_tokens_seen": 6291456,
+       "step": 12
+     },
+     {
+       "epoch": 0.5339265850945495,
+       "grad_norm": 0.58984375,
+       "learning_rate": 2.88294878209231e-05,
+       "loss": 2.756,
+       "num_input_tokens_seen": 7864320,
+       "step": 15
+     },
+     {
+       "epoch": 0.6407119021134594,
+       "grad_norm": 0.54296875,
+       "learning_rate": 2.8039804116593743e-05,
+       "loss": 2.7252,
+       "num_input_tokens_seen": 9437184,
+       "step": 18
+     },
+     {
+       "epoch": 0.7474972191323693,
+       "grad_norm": 0.54296875,
+       "learning_rate": 2.7064748479061476e-05,
+       "loss": 2.7188,
+       "num_input_tokens_seen": 11010048,
+       "step": 21
+     },
+     {
+       "epoch": 0.8542825361512792,
+       "grad_norm": 0.484375,
+       "learning_rate": 2.5918182155542415e-05,
+       "loss": 2.7574,
+       "num_input_tokens_seen": 12582912,
+       "step": 24
+     },
+     {
+       "epoch": 0.9610678531701891,
+       "grad_norm": 0.5,
+       "learning_rate": 2.4616404563883302e-05,
+       "loss": 2.7792,
+       "num_input_tokens_seen": 14155776,
+       "step": 27
+     },
+     {
+       "epoch": 0.996662958843159,
+       "eval_accuracy": 0.4287265815910942,
+       "eval_loss": 2.818293809890747,
+       "eval_runtime": 19.7558,
+       "eval_samples_per_second": 1.468,
+       "eval_steps_per_second": 1.468,
+       "num_input_tokens_seen": 14729216,
+       "step": 28
+     },
+     {
+       "epoch": 1.067853170189099,
+       "grad_norm": 0.50390625,
+       "learning_rate": 2.3177921582440015e-05,
+       "loss": 2.6953,
+       "num_input_tokens_seen": 15728640,
+       "step": 30
+     },
+     {
+       "epoch": 1.174638487208009,
+       "grad_norm": 0.462890625,
+       "learning_rate": 2.162318247323868e-05,
+       "loss": 2.7098,
+       "num_input_tokens_seen": 17301504,
+       "step": 33
+     },
+     {
+       "epoch": 1.2814238042269188,
+       "grad_norm": 0.451171875,
+       "learning_rate": 1.997428917828102e-05,
+       "loss": 2.6726,
+       "num_input_tokens_seen": 18874368,
+       "step": 36
+     },
+     {
+       "epoch": 1.3882091212458287,
+       "grad_norm": 0.462890625,
+       "learning_rate": 1.825468212159477e-05,
+       "loss": 2.6655,
+       "num_input_tokens_seen": 20447232,
+       "step": 39
+     },
+     {
+       "epoch": 1.4949944382647387,
+       "grad_norm": 0.455078125,
+       "learning_rate": 1.6488806983620927e-05,
+       "loss": 2.7292,
+       "num_input_tokens_seen": 22020096,
+       "step": 42
+     },
+     {
+       "epoch": 1.6017797552836486,
+       "grad_norm": 0.4453125,
+       "learning_rate": 1.4701767185023948e-05,
+       "loss": 2.6312,
+       "num_input_tokens_seen": 23592960,
+       "step": 45
+     },
+     {
+       "epoch": 1.7085650723025583,
+       "grad_norm": 0.43359375,
+       "learning_rate": 1.2918967020163978e-05,
+       "loss": 2.7176,
+       "num_input_tokens_seen": 25165824,
+       "step": 48
+     },
+     {
+       "epoch": 1.8153503893214684,
+       "grad_norm": 0.4453125,
+       "learning_rate": 1.116575051339288e-05,
+       "loss": 2.7234,
+       "num_input_tokens_seen": 26738688,
+       "step": 51
+     },
+     {
+       "epoch": 1.9221357063403781,
+       "grad_norm": 0.4375,
+       "learning_rate": 9.467041132139884e-06,
+       "loss": 2.6971,
+       "num_input_tokens_seen": 28311552,
+       "step": 54
+     },
+     {
+       "epoch": 1.993325917686318,
+       "eval_accuracy": 0.4297116228554831,
+       "eval_loss": 2.8111748695373535,
+       "eval_runtime": 19.5837,
+       "eval_samples_per_second": 1.481,
+       "eval_steps_per_second": 1.481,
+       "num_input_tokens_seen": 29458432,
+       "step": 56
+     },
+     {
+       "epoch": 2.0289210233592883,
+       "grad_norm": 0.41796875,
+       "learning_rate": 7.846987478572411e-06,
+       "loss": 2.6623,
+       "num_input_tokens_seen": 29884416,
+       "step": 57
+     },
+     {
+       "epoch": 2.135706340378198,
+       "grad_norm": 0.396484375,
+       "learning_rate": 6.328619996627272e-06,
+       "loss": 2.6211,
+       "num_input_tokens_seen": 31457280,
+       "step": 60
+     },
+     {
+       "epoch": 2.242491657397108,
+       "grad_norm": 0.416015625,
+       "learning_rate": 4.933523574614447e-06,
+       "loss": 2.6728,
+       "num_input_tokens_seen": 33030144,
+       "step": 63
+     },
+     {
+       "epoch": 2.349276974416018,
+       "grad_norm": 0.41015625,
+       "learning_rate": 3.6815306976265466e-06,
+       "loss": 2.6567,
+       "num_input_tokens_seen": 34603008,
+       "step": 66
+     },
+     {
+       "epoch": 2.456062291434928,
+       "grad_norm": 0.435546875,
+       "learning_rate": 2.590439511854144e-06,
+       "loss": 2.6862,
+       "num_input_tokens_seen": 36175872,
+       "step": 69
+     },
+     {
+       "epoch": 2.5628476084538376,
+       "grad_norm": 0.419921875,
+       "learning_rate": 1.6757608087630249e-06,
+       "loss": 2.6841,
+       "num_input_tokens_seen": 37748736,
+       "step": 72
+     },
+     {
+       "epoch": 2.6696329254727473,
+       "grad_norm": 0.40625,
+       "learning_rate": 9.504975259690835e-07,
+       "loss": 2.7272,
+       "num_input_tokens_seen": 39321600,
+       "step": 75
+     },
+     {
+       "epoch": 2.7764182424916575,
+       "grad_norm": 0.40625,
+       "learning_rate": 4.2495989939384916e-07,
+       "loss": 2.6643,
+       "num_input_tokens_seen": 40894464,
+       "step": 78
+     },
+     {
+       "epoch": 2.883203559510567,
+       "grad_norm": 0.408203125,
+       "learning_rate": 1.0661889447039886e-07,
+       "loss": 2.679,
+       "num_input_tokens_seen": 42467328,
+       "step": 81
+     },
+     {
+       "epoch": 2.9899888765294773,
+       "grad_norm": 0.412109375,
+       "learning_rate": 0.0,
+       "loss": 2.7116,
+       "num_input_tokens_seen": 44040192,
+       "step": 84
+     },
+     {
+       "epoch": 2.9899888765294773,
+       "eval_accuracy": 0.4297979192055684,
+       "eval_loss": 2.810983180999756,
+       "eval_runtime": 19.4794,
+       "eval_samples_per_second": 1.489,
+       "eval_steps_per_second": 1.489,
+       "num_input_tokens_seen": 44040192,
+       "step": 84
+     },
+     {
+       "epoch": 2.9899888765294773,
+       "num_input_tokens_seen": 44040192,
+       "step": 84,
+       "total_flos": 3.462459117703004e+17,
+       "train_loss": 2.7097946518943425,
+       "train_runtime": 6795.1554,
+       "train_samples_per_second": 0.397,
+       "train_steps_per_second": 0.012
+     }
+   ],
+   "logging_steps": 3,
+   "max_steps": 84,
+   "num_input_tokens_seen": 44040192,
+   "num_train_epochs": 3,
+   "save_steps": 50,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 3.462459117703004e+17,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
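Since `log_history` holds the full logging trace, the training and evaluation loss curves can be recovered from this file directly; a short sketch:

```python
import json

# Pull (step, loss) pairs back out of the trainer state above.
with open("trainer_state.json") as f:
    state = json.load(f)

train_log = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train_log[:3])  # [(3, 2.8121), (6, 2.8211), (9, 2.7286)]
print(eval_log)       # eval loss at steps 28, 56, and 84
```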
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79753a25dbb8b2565e4d20ffdac00704b8318b52bea001e3e03550986f4e7bf0
+ size 5240
vocab.json ADDED
The diff for this file is too large to render. See raw diff