Commit d513eb6 (verified) · Parent(s): 31dc204
ilacunza committed: Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+wandb/offline-run-20250402_134739-nwi5m2nq/files/output.log filter=lfs diff=lfs merge=lfs -text
+wandb/offline-run-20250402_134739-nwi5m2nq/run-nwi5m2nq.wandb filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,30 @@
+{
+  "_name_or_path": "/gpfs/projects/bsc88/text/models/instruction-tuning/models/base_models_with_special_tokens/iberianLLM_7B_xdogeCPT_34k",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.2",
+  "use_cache": true,
+  "vocab_size": 256000
+}
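
The config above describes a Llama-architecture decoder (32 layers, hidden size 4096, grouped-query attention with 8 KV heads, 256k vocabulary). A minimal sanity-check sketch with transformers — the local directory name is an assumption, not part of this commit:

```python
from transformers import AutoConfig

# Hypothetical local checkout of this repo; the path is an assumption.
config = AutoConfig.from_pretrained("./iberianLLM_7B_xdogeCPT_34k_instruct-v1.0")

# Grouped-query attention: 32 query heads share 8 KV heads, i.e. 4 query heads per KV head.
assert config.num_attention_heads // config.num_key_value_heads == 4
# head_dim * num_attention_heads recovers the hidden size: 128 * 32 = 4096.
assert config.head_dim * config.num_attention_heads == config.hidden_size
print(config.model_type, config.vocab_size)  # llama 256000
```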
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.40.2"
+}
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b38716c54f507511b7d41e7e1471e5593058a6b093c901d633a0df8ac8bbf327
+size 4982973048
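
Each shard diff here is a Git LFS pointer file (spec version, sha256 oid, byte size), not the weights themselves. A sketch, assuming you have downloaded the actual shard locally, of verifying it against the pointer above:

```python
import hashlib

# Not part of the repo: verify a downloaded shard against its LFS pointer.
# The local filename is an assumption about where you saved it.
def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

print(verify_lfs_object(
    "model-00001-of-00004.safetensors",
    "b38716c54f507511b7d41e7e1471e5593058a6b093c901d633a0df8ac8bbf327",
    4982973048,
))
```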
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:232a0aa2eeff088c94f2f783057a5db561c3c027fa4453bffffcadeabe44806e
+size 4995660232
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5ec5007d9bd490602f9e5a4f217464b363279d05b8e57761927f7bed326cf76
+size 3460482936
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2219f41f755a621731c33555c7d40b82c8f9a33464a5304fc0f1bc39a7ba0707
+size 2097152128
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 15536234496
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00003-of-00004.safetensors"
+  }
+}
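
The index's weight_map routes each tensor name to the shard that stores it (note the split layers: layer 22's attention projections live in shard 2 while its MLP sits in shard 3, and layer 8 straddles shards 1 and 2). A sketch of resolving one tensor through the map with the safetensors library, assuming the shards have been downloaded locally:

```python
import json
from safetensors import safe_open

# Files assumed local; resolve a tensor name to its shard, then read it.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.22.self_attn.k_proj.weight"  # stored in shard 2, per the map above
shard = index["weight_map"][name]
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)
print(shard, tuple(tensor.shape))
```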
special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e38708bcb6da1791b245492580b4acc2738ffb0270490b06cd319c5aecafa61
+size 21011862
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3019afe457a8082afa56cb2983f3e7e8ead0655c3c408c13dddcb5984c13e3a
+size 4918006
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 8192,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
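
The chat_template above is ChatML-style: each turn is wrapped in <|im_start|>role ... <|im_end|>, and an assistant header is appended when a generation prompt is requested. A sketch of rendering it — the local repo path is an assumption:

```python
from transformers import AutoTokenizer

# Hypothetical local checkout of this repo.
tokenizer = AutoTokenizer.from_pretrained("./iberianLLM_7B_xdogeCPT_34k_instruct-v1.0")

messages = [{"role": "user", "content": "Hola!"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# Hola!<|im_end|>
# <|im_start|>assistant
```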
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log ADDED
@@ -0,0 +1,27 @@
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Current SDK version is 0.17.0
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Configure stats pid to 1538306
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Loading settings from /home/bsc/bsc088369/.config/wandb/settings
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Loading settings from /gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/wandb/settings
+2025-04-02 13:47:39,397 WARNING MainThread:1538306 [wandb_setup.py:_flush():76] Unknown environment variable: WANDB_CONFIG_DIR
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'root_dir': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0', 'init_timeout': '600', 'project': 'instruction-tuning', 'run_name': 'iberianLLM_7B_xdogeCPT_34k_instruct-v1.0_18503701', 'mode': 'offline'}
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'fastchat/train/train.py', 'program_abspath': '/gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/fastchat/train/train.py', 'program': '/gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/fastchat/train/train.py'}
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:_log_setup():520] Logging user logs to /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/wandb/offline-run-20250402_134739-nwi5m2nq/logs/debug.log
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:_log_setup():521] Logging internal logs to /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/wandb/offline-run-20250402_134739-nwi5m2nq/logs/debug-internal.log
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():560] calling init triggers
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
+config: {}
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():610] starting backend
+2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():614] setting up manager
+2025-04-02 13:47:39,398 INFO MainThread:1538306 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-02 13:47:39,400 INFO MainThread:1538306 [wandb_init.py:init():622] backend started and connected
+2025-04-02 13:47:39,403 INFO MainThread:1538306 [wandb_init.py:init():711] updated telemetry
+2025-04-02 13:47:39,500 INFO MainThread:1538306 [wandb_init.py:init():744] communicating run to backend with 600.0 second timeout
+2025-04-02 13:47:39,504 INFO MainThread:1538306 [wandb_init.py:init():795] starting run threads in backend
+2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_console_start():2374] atexit reg
+2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_redirect():2229] redirect: wrap_raw
+2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_redirect():2294] Wrapping output streams.
+2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_redirect():2319] Redirects installed.
+2025-04-02 13:47:40,892 INFO MainThread:1538306 [wandb_init.py:init():838] run started, returning control to user process
+2025-04-02 13:47:40,893 INFO MainThread:1538306 [wandb_run.py:_config_callback():1376] config_cb None None {'vocab_size': 256000, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/base_models_with_special_tokens/iberianLLM_7B_xdogeCPT_34k', 'transformers_version': '4.40.2', 'head_dim': 128, 'mlp_bias': False, 'model_type': 'llama', 'output_dir': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 2, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 1e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/runs/Apr02_13-46-41_as03r3b01', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 15, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 0.01, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_configs/ds_type3_config_autombs.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': 5.0, 'optim_target_modules': None, 'cache_dir': None, 'model_max_length': 8192}
+2025-04-03 05:48:27,018 WARNING MsgRouterThr:1538306 [router.py:message_loop():77] message_loop has been closed
wandb/offline-run-20250402_134739-nwi5m2nq/files/config.yaml ADDED
@@ -0,0 +1,660 @@
+wandb_version: 1
+
+_wandb:
+  desc: null
+  value:
+    python_version: 3.9.16
+    cli_version: 0.17.0
+    framework: huggingface
+    huggingface_version: 4.40.2
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1743594459
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 51
+      - 55
+      - 71
+      - 98
+      - 105
+      2:
+      - 1
+      - 11
+      - 49
+      - 51
+      - 55
+      - 71
+      - 98
+      - 105
+      3:
+      - 4
+      - 7
+      - 23
+      - 37
+      - 42
+      - 62
+      - 66
+      4: 3.9.16
+      5: 0.17.0
+      6: 4.40.2
+      8:
+      - 5
+      9:
+        1: transformers_trainer
+      13: linux-x86_64
+      m:
+      - 1: train/global_step
+        6:
+        - 3
+vocab_size:
+  desc: null
+  value: 256000
+max_position_embeddings:
+  desc: null
+  value: 8192
+hidden_size:
+  desc: null
+  value: 4096
+intermediate_size:
+  desc: null
+  value: 11008
+num_hidden_layers:
+  desc: null
+  value: 32
+num_attention_heads:
+  desc: null
+  value: 32
+num_key_value_heads:
+  desc: null
+  value: 8
+hidden_act:
+  desc: null
+  value: silu
+initializer_range:
+  desc: null
+  value: 0.02
+rms_norm_eps:
+  desc: null
+  value: 1.0e-05
+pretraining_tp:
+  desc: null
+  value: 1
+use_cache:
+  desc: null
+  value: false
+rope_theta:
+  desc: null
+  value: 10000.0
+rope_scaling:
+  desc: null
+  value: null
+attention_bias:
+  desc: null
+  value: false
+attention_dropout:
+  desc: null
+  value: 0.0
+return_dict:
+  desc: null
+  value: true
+output_hidden_states:
+  desc: null
+  value: false
+output_attentions:
+  desc: null
+  value: false
+torchscript:
+  desc: null
+  value: false
+torch_dtype:
+  desc: null
+  value: bfloat16
+use_bfloat16:
+  desc: null
+  value: false
+tf_legacy_loss:
+  desc: null
+  value: false
+pruned_heads:
+  desc: null
+  value: {}
+tie_word_embeddings:
+  desc: null
+  value: false
+chunk_size_feed_forward:
+  desc: null
+  value: 0
+is_encoder_decoder:
+  desc: null
+  value: false
+is_decoder:
+  desc: null
+  value: false
+cross_attention_hidden_size:
+  desc: null
+  value: null
+add_cross_attention:
+  desc: null
+  value: false
+tie_encoder_decoder:
+  desc: null
+  value: false
+max_length:
+  desc: null
+  value: 20
+min_length:
+  desc: null
+  value: 0
+do_sample:
+  desc: null
+  value: false
+early_stopping:
+  desc: null
+  value: false
+num_beams:
+  desc: null
+  value: 1
+num_beam_groups:
+  desc: null
+  value: 1
+diversity_penalty:
+  desc: null
+  value: 0.0
+temperature:
+  desc: null
+  value: 1.0
+top_k:
+  desc: null
+  value: 50
+top_p:
+  desc: null
+  value: 1.0
+typical_p:
+  desc: null
+  value: 1.0
+repetition_penalty:
+  desc: null
+  value: 1.0
+length_penalty:
+  desc: null
+  value: 1.0
+no_repeat_ngram_size:
+  desc: null
+  value: 0
+encoder_no_repeat_ngram_size:
+  desc: null
+  value: 0
+bad_words_ids:
+  desc: null
+  value: null
+num_return_sequences:
+  desc: null
+  value: 1
+output_scores:
+  desc: null
+  value: false
+return_dict_in_generate:
+  desc: null
+  value: false
+forced_bos_token_id:
+  desc: null
+  value: null
+forced_eos_token_id:
+  desc: null
+  value: null
+remove_invalid_values:
+  desc: null
+  value: false
+exponential_decay_length_penalty:
+  desc: null
+  value: null
+suppress_tokens:
+  desc: null
+  value: null
+begin_suppress_tokens:
+  desc: null
+  value: null
+architectures:
+  desc: null
+  value:
+  - LlamaForCausalLM
+finetuning_task:
+  desc: null
+  value: null
+id2label:
+  desc: null
+  value:
+    '0': LABEL_0
+    '1': LABEL_1
+label2id:
+  desc: null
+  value:
+    LABEL_0: 0
+    LABEL_1: 1
+tokenizer_class:
+  desc: null
+  value: null
+prefix:
+  desc: null
+  value: null
+bos_token_id:
+  desc: null
+  value: 1
+pad_token_id:
+  desc: null
+  value: null
+eos_token_id:
+  desc: null
+  value: 2
+sep_token_id:
+  desc: null
+  value: null
+decoder_start_token_id:
+  desc: null
+  value: null
+task_specific_params:
+  desc: null
+  value: null
+problem_type:
+  desc: null
+  value: null
+_name_or_path:
+  desc: null
+  value: /gpfs/projects/bsc88/text/models/instruction-tuning/models/base_models_with_special_tokens/iberianLLM_7B_xdogeCPT_34k
+transformers_version:
+  desc: null
+  value: 4.40.2
+head_dim:
+  desc: null
+  value: 128
+mlp_bias:
+  desc: null
+  value: false
+model_type:
+  desc: null
+  value: llama
+output_dir:
+  desc: null
+  value: /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0
+overwrite_output_dir:
+  desc: null
+  value: false
+do_train:
+  desc: null
+  value: false
+do_eval:
+  desc: null
+  value: true
+do_predict:
+  desc: null
+  value: false
+evaluation_strategy:
+  desc: null
+  value: steps
+prediction_loss_only:
+  desc: null
+  value: false
+per_device_train_batch_size:
+  desc: null
+  value: 1
+per_device_eval_batch_size:
+  desc: null
+  value: 2
+per_gpu_train_batch_size:
+  desc: null
+  value: null
+per_gpu_eval_batch_size:
+  desc: null
+  value: null
+gradient_accumulation_steps:
+  desc: null
+  value: 8
+eval_accumulation_steps:
+  desc: null
+  value: null
+eval_delay:
+  desc: null
+  value: 0
+learning_rate:
+  desc: null
+  value: 1.0e-05
+weight_decay:
+  desc: null
+  value: 0.0
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.999
+adam_epsilon:
+  desc: null
+  value: 1.0e-08
+max_grad_norm:
+  desc: null
+  value: 1.0
+num_train_epochs:
+  desc: null
+  value: 2.0
+max_steps:
+  desc: null
+  value: -1
+lr_scheduler_type:
+  desc: null
+  value: cosine
+lr_scheduler_kwargs:
+  desc: null
+  value: {}
+warmup_ratio:
+  desc: null
+  value: 0.03
+warmup_steps:
+  desc: null
+  value: 0
+log_level:
+  desc: null
+  value: passive
+log_level_replica:
+  desc: null
+  value: warning
+log_on_each_node:
+  desc: null
+  value: true
+logging_dir:
+  desc: null
+  value: /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/runs/Apr02_13-46-41_as03r3b01
+logging_strategy:
+  desc: null
+  value: steps
+logging_first_step:
+  desc: null
+  value: false
+logging_steps:
+  desc: null
+  value: 1.0
+logging_nan_inf_filter:
+  desc: null
+  value: true
+save_strategy:
+  desc: null
+  value: epoch
+save_steps:
+  desc: null
+  value: 500
+save_total_limit:
+  desc: null
+  value: 15
+save_safetensors:
+  desc: null
+  value: true
+save_on_each_node:
+  desc: null
+  value: false
+save_only_model:
+  desc: null
+  value: false
+no_cuda:
+  desc: null
+  value: false
+use_cpu:
+  desc: null
+  value: false
+use_mps_device:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 42
+data_seed:
+  desc: null
+  value: null
+jit_mode_eval:
+  desc: null
+  value: false
+use_ipex:
+  desc: null
+  value: false
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+fp16_opt_level:
+  desc: null
+  value: O1
+half_precision_backend:
+  desc: null
+  value: auto
+bf16_full_eval:
+  desc: null
+  value: false
+fp16_full_eval:
+  desc: null
+  value: false
+tf32:
+  desc: null
+  value: null
+local_rank:
+  desc: null
+  value: 0
+ddp_backend:
+  desc: null
+  value: null
+tpu_num_cores:
+  desc: null
+  value: null
+tpu_metrics_debug:
+  desc: null
+  value: false
+debug:
+  desc: null
+  value: []
+dataloader_drop_last:
+  desc: null
+  value: false
+eval_steps:
+  desc: null
+  value: 0.01
+dataloader_num_workers:
+  desc: null
+  value: 0
+dataloader_prefetch_factor:
+  desc: null
+  value: null
+past_index:
+  desc: null
+  value: -1
+run_name:
+  desc: null
+  value: /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0
+disable_tqdm:
+  desc: null
+  value: false
+remove_unused_columns:
+  desc: null
+  value: true
+label_names:
+  desc: null
+  value: null
+load_best_model_at_end:
+  desc: null
+  value: false
+metric_for_best_model:
+  desc: null
+  value: null
+greater_is_better:
+  desc: null
+  value: null
+ignore_data_skip:
+  desc: null
+  value: false
+fsdp:
+  desc: null
+  value: []
+fsdp_min_num_params:
+  desc: null
+  value: 0
+fsdp_config:
+  desc: null
+  value:
+    min_num_params: 0
+    xla: false
+    xla_fsdp_v2: false
+    xla_fsdp_grad_ckpt: false
+fsdp_transformer_layer_cls_to_wrap:
+  desc: null
+  value: null
+accelerator_config:
+  desc: null
+  value:
+    split_batches: false
+    dispatch_batches: null
+    even_batches: true
+    use_seedable_sampler: true
+    gradient_accumulation_kwargs: null
+deepspeed:
+  desc: null
+  value: deepspeed_configs/ds_type3_config_autombs.json
+label_smoothing_factor:
+  desc: null
+  value: 0.0
+optim:
+  desc: null
+  value: adamw_torch
+optim_args:
+  desc: null
+  value: null
+adafactor:
+  desc: null
+  value: false
+group_by_length:
+  desc: null
+  value: false
+length_column_name:
+  desc: null
+  value: length
+report_to:
+  desc: null
+  value:
+  - wandb
+ddp_find_unused_parameters:
+  desc: null
+  value: null
+ddp_bucket_cap_mb:
+  desc: null
+  value: null
+ddp_broadcast_buffers:
+  desc: null
+  value: null
+dataloader_pin_memory:
+  desc: null
+  value: true
+dataloader_persistent_workers:
+  desc: null
+  value: false
+skip_memory_metrics:
+  desc: null
+  value: true
+use_legacy_prediction_loop:
+  desc: null
+  value: false
+push_to_hub:
+  desc: null
+  value: false
+resume_from_checkpoint:
+  desc: null
+  value: null
+hub_model_id:
+  desc: null
+  value: null
+hub_strategy:
+  desc: null
+  value: every_save
+hub_token:
+  desc: null
+  value: <HUB_TOKEN>
+hub_private_repo:
+  desc: null
+  value: false
+hub_always_push:
+  desc: null
+  value: false
+gradient_checkpointing:
+  desc: null
+  value: true
+gradient_checkpointing_kwargs:
+  desc: null
+  value: null
+include_inputs_for_metrics:
+  desc: null
+  value: false
+eval_do_concat_batches:
+  desc: null
+  value: true
+fp16_backend:
+  desc: null
+  value: auto
+push_to_hub_model_id:
+  desc: null
+  value: null
+push_to_hub_organization:
+  desc: null
+  value: null
+push_to_hub_token:
+  desc: null
+  value: <PUSH_TO_HUB_TOKEN>
+mp_parameters:
+  desc: null
+  value: ''
+auto_find_batch_size:
+  desc: null
+  value: false
+full_determinism:
+  desc: null
+  value: false
+torchdynamo:
+  desc: null
+  value: null
+ray_scope:
+  desc: null
+  value: last
+ddp_timeout:
+  desc: null
+  value: 1800
+torch_compile:
+  desc: null
+  value: false
+torch_compile_backend:
+  desc: null
+  value: null
+torch_compile_mode:
+  desc: null
+  value: null
+dispatch_batches:
+  desc: null
+  value: null
+split_batches:
+  desc: null
+  value: null
+include_tokens_per_second:
+  desc: null
+  value: false
+include_num_input_tokens_seen:
+  desc: null
+  value: false
+neftune_noise_alpha:
+  desc: null
+  value: 5.0
+optim_target_modules:
+  desc: null
+  value: null
+cache_dir:
+  desc: null
+  value: null
+model_max_length:
+  desc: null
+  value: 8192
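
The offline run's config.yaml stores each hyperparameter as a {desc, value} pair. A sketch of reading it back and computing the effective batch size (per-device batch 1 × gradient accumulation 8, times however many GPUs the job used, which this file alone does not record):

```python
import yaml

# Assumes the offline run directory from this commit is present locally.
with open("wandb/offline-run-20250402_134739-nwi5m2nq/files/config.yaml") as f:
    cfg = yaml.safe_load(f)

micro = cfg["per_device_train_batch_size"]["value"]  # 1
accum = cfg["gradient_accumulation_steps"]["value"]  # 8
print(f"effective batch size = {micro * accum} x world_size")
```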
wandb/offline-run-20250402_134739-nwi5m2nq/files/output.log ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a248c9fea501a6a130276aafd24f843ac8b443700db305febfa7ef503ec703cd
+size 62978542
wandb/offline-run-20250402_134739-nwi5m2nq/files/wandb-metadata.json ADDED
@@ -0,0 +1,932 @@
+ {
+ "os": "Linux-5.14.0-284.30.1.el9_2.x86_64-x86_64-with-glibc2.34",
+ "python": "3.9.16",
+ "heartbeatAt": "2025-04-02T11:47:39.624181",
+ "startedAt": "2025-04-02T11:47:39.383806",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--deepspeed",
+ "deepspeed_configs/ds_type3_config_autombs.json",
+ "--model_name_or_path",
+ "/gpfs/projects/bsc88/text/models/instruction-tuning/models/base_models_with_special_tokens/iberianLLM_7B_xdogeCPT_34k",
+ "--preprocess_conv_template",
+ "bsc_chat_template_system_0.5",
+ "--neftune_noise_alpha",
+ "5",
+ "--data_paths",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_ca-en_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_ca-es_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_ca_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_en-ca_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_en-es_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_en-pt_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_en_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_es-ca_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_es-en_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_es-eu_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_es-gl_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_es_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_eu-es_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_eu_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_gl-es_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_gl_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_pt-en_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/exp6.1_sampling_pt_v0.1.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/new_alpaca-cleaned_en.json",
+ "/gpfs/projects/bsc88/data/04-instruction-samplings/system-prompt-experiments/exp6.1_iberian-sp/new_open-orca-system-prompt_en.json",
+ "--bf16",
+ "True",
+ "--output_dir",
+ "/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0",
+ "--num_train_epochs",
+ "2",
+ "--per_device_train_batch_size",
+ "1",
+ "--per_device_eval_batch_size",
+ "2",
+ "--gradient_accumulation_steps",
+ "8",
+ "--evaluation_strategy",
+ "steps",
+ "--eval_steps",
+ "0.01",
+ "--save_strategy",
+ "epoch",
+ "--save_total_limit",
+ "15",
+ "--learning_rate",
+ "1e-5",
+ "--weight_decay",
+ "0.",
+ "--warmup_ratio",
+ "0.03",
+ "--lr_scheduler_type",
+ "cosine",
+ "--logging_steps",
+ "1",
+ "--model_max_length",
+ "8192",
+ "--gradient_checkpointing",
+ "True",
+ "--lazy_preprocess",
+ "True",
+ "--add_chat_template",
+ "True",
+ "--local_rank",
+ "0",
+ "--report_to",
+ "wandb"
+ ],
+ "state": "running",
+ "program": "/gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/fastchat/train/train.py",
+ "codePathLocal": "fastchat/train/train.py",
+ "codePath": "fastchat/train/train.py",
+ "git": {
+ "remote": "https://github.com/langtech-bsc/it-chat-v1.git",
+ "commit": "019a772ed26e859b133058d581696f57f0fa613e"
+ },
+ "email": "[email protected]",
+ "root": "/gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1",
+ "host": "as03r3b01",
+ "username": "bsc088369",
+ "executable": "/gpfs/projects/bsc88/text/environments/instruction-tuning_inference_mn5_python3.9_20241107/bin/python",
+ "cpu_count": 80,
+ "cpu_count_logical": 160,
+ "cpu_freq": {
+ "current": 3699.94764375,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 3133.761,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3699.997,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3699.997,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3697.752,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.001,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ },
+ {
+ "current": 3700.0,
+ "min": 800.0,
+ "max": 3700.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 65243.953125,
+ "used": 15974.58984375
+ }
+ },
+ "gpu": "NVIDIA H100",
+ "gpu_count": 4,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA H100",
+ "memory_total": 68416438272
+ },
+ {
+ "name": "NVIDIA H100",
+ "memory_total": 68416438272
+ },
+ {
+ "name": "NVIDIA H100",
+ "memory_total": 68416438272
+ },
+ {
+ "name": "NVIDIA H100",
+ "memory_total": 68416438272
+ }
+ ],
+ "memory": {
+ "total": 503.48146057128906
+ }
+ }
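wandb-metadata.json above records the host environment (4x NVIDIA H100, 80 physical cores, ~503 GiB RAM) and the full argv passed to the training script. A small sketch, assuming the run-directory paths used in this commit, that reconstructs an approximation of the recorded invocation (note the surrounding deepspeed/SLURM launcher itself is not captured in "args"):

```python
import json
import shlex

META = "wandb/offline-run-20250402_134739-nwi5m2nq/files/wandb-metadata.json"
with open(META) as f:
    meta = json.load(f)

# Rebuild the command wandb saw: interpreter + training script + argv.
cmd = [meta["executable"], meta["program"], *meta["args"]]
print(shlex.join(cmd))
```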
wandb/offline-run-20250402_134739-nwi5m2nq/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"train/loss": 0.7396, "train/grad_norm": 1.615809772493014, "train/learning_rate": 0.0, "train/epoch": 1.9995261786306564, "train/global_step": 2110, "_timestamp": 1743652092.048609, "_runtime": 57632.64785003662, "_step": 2205, "eval/loss": 1.0688345432281494, "eval/runtime": 63.103, "eval/samples_per_second": 43.231, "eval/steps_per_second": 0.681, "train_runtime": 57633.4394, "train_samples_per_second": 9.374, "train_steps_per_second": 0.037, "total_flos": 7068238416445440.0, "train_loss": 0.9307103937271082, "_wandb": {"runtime": 57646}}
wandb/offline-run-20250402_134739-nwi5m2nq/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/offline-run-20250402_134739-nwi5m2nq/logs/debug.log ADDED
@@ -0,0 +1,27 @@
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Current SDK version is 0.17.0
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Configure stats pid to 1538306
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Loading settings from /home/bsc/bsc088369/.config/wandb/settings
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Loading settings from /gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/wandb/settings
+ 2025-04-02 13:47:39,397 WARNING MainThread:1538306 [wandb_setup.py:_flush():76] Unknown environment variable: WANDB_CONFIG_DIR
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'root_dir': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0', 'init_timeout': '600', 'project': 'instruction-tuning', 'run_name': 'iberianLLM_7B_xdogeCPT_34k_instruct-v1.0_18503701', 'mode': 'offline'}
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'fastchat/train/train.py', 'program_abspath': '/gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/fastchat/train/train.py', 'program': '/gpfs/projects/bsc88/text/models/instruction-tuning/it-chat-v1/fastchat/train/train.py'}
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:_log_setup():520] Logging user logs to /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/wandb/offline-run-20250402_134739-nwi5m2nq/logs/debug.log
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:_log_setup():521] Logging internal logs to /gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/wandb/offline-run-20250402_134739-nwi5m2nq/logs/debug-internal.log
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():560] calling init triggers
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
+ config: {}
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():610] starting backend
+ 2025-04-02 13:47:39,397 INFO MainThread:1538306 [wandb_init.py:init():614] setting up manager
+ 2025-04-02 13:47:39,398 INFO MainThread:1538306 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2025-04-02 13:47:39,400 INFO MainThread:1538306 [wandb_init.py:init():622] backend started and connected
+ 2025-04-02 13:47:39,403 INFO MainThread:1538306 [wandb_init.py:init():711] updated telemetry
+ 2025-04-02 13:47:39,500 INFO MainThread:1538306 [wandb_init.py:init():744] communicating run to backend with 600.0 second timeout
+ 2025-04-02 13:47:39,504 INFO MainThread:1538306 [wandb_init.py:init():795] starting run threads in backend
+ 2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_console_start():2374] atexit reg
+ 2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_redirect():2229] redirect: wrap_raw
+ 2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_redirect():2294] Wrapping output streams.
+ 2025-04-02 13:47:40,849 INFO MainThread:1538306 [wandb_run.py:_redirect():2319] Redirects installed.
+ 2025-04-02 13:47:40,892 INFO MainThread:1538306 [wandb_init.py:init():838] run started, returning control to user process
+ 2025-04-02 13:47:40,893 INFO MainThread:1538306 [wandb_run.py:_config_callback():1376] config_cb None None {'vocab_size': 256000, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/base_models_with_special_tokens/iberianLLM_7B_xdogeCPT_34k', 'transformers_version': '4.40.2', 'head_dim': 128, 'mlp_bias': False, 'model_type': 'llama', 'output_dir': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 2, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 1e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0/runs/Apr02_13-46-41_as03r3b01', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 15, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 
'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 0.01, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/gpfs/projects/bsc88/text/models/instruction-tuning/models/checkpoints/iberoLLM_april25/iberianLLM_7B_xdogeCPT_34k_instruct-v1.0', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_configs/ds_type3_config_autombs.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': 5.0, 'optim_target_modules': None, 'cache_dir': None, 'model_max_length': 8192}
+ 2025-04-03 05:48:27,018 WARNING MsgRouterThr:1538306 [router.py:message_loop():77] message_loop has been closed
wandb/offline-run-20250402_134739-nwi5m2nq/run-nwi5m2nq.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:477cbc8e46ae66274b7508f2abc3b45976e10b9cbe1b5c0468a9a6b966408b63
+ size 115718034
wandb/offline-run-20250402_134739-nwi5m2nq/run-nwi5m2nq.wandb.synced ADDED
File without changes