LogicBombaklot committed on
Commit
0149464
·
verified ·
1 Parent(s): 39ffa85

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +1492 -0
config.json ADDED
@@ -0,0 +1,1492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeciLMForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_decilm.DeciLMConfig",
9
+ "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM"
10
+ },
11
+ "block_configs": [
12
+ {
13
+ "attention": {
14
+ "n_heads_in_group": 8,
15
+ "no_op": false,
16
+ "num_sink_tokens": null,
17
+ "replace_with_linear": false,
18
+ "sparsify": null,
19
+ "unshifted_sink": false,
20
+ "use_prefill_window_in_sink_attention": false,
21
+ "window_length": null
22
+ },
23
+ "ffn": {
24
+ "ffn_mult": 2.625,
25
+ "no_op": false,
26
+ "replace_with_linear": false,
27
+ "sparsify": null
28
+ }
29
+ },
30
+ {
31
+ "attention": {
32
+ "n_heads_in_group": 8,
33
+ "no_op": false,
34
+ "num_sink_tokens": null,
35
+ "replace_with_linear": false,
36
+ "sparsify": null,
37
+ "unshifted_sink": false,
38
+ "use_prefill_window_in_sink_attention": false,
39
+ "window_length": null
40
+ },
41
+ "ffn": {
42
+ "ffn_mult": 5.25,
43
+ "no_op": false,
44
+ "replace_with_linear": false,
45
+ "sparsify": null
46
+ }
47
+ },
48
+ {
49
+ "attention": {
50
+ "n_heads_in_group": 8,
51
+ "no_op": false,
52
+ "num_sink_tokens": null,
53
+ "replace_with_linear": false,
54
+ "sparsify": null,
55
+ "unshifted_sink": false,
56
+ "use_prefill_window_in_sink_attention": false,
57
+ "window_length": null
58
+ },
59
+ "ffn": {
60
+ "ffn_mult": 5.25,
61
+ "no_op": false,
62
+ "replace_with_linear": false,
63
+ "sparsify": null
64
+ }
65
+ },
66
+ {
67
+ "attention": {
68
+ "n_heads_in_group": 8,
69
+ "no_op": false,
70
+ "num_sink_tokens": null,
71
+ "replace_with_linear": false,
72
+ "sparsify": null,
73
+ "unshifted_sink": false,
74
+ "use_prefill_window_in_sink_attention": false,
75
+ "window_length": null
76
+ },
77
+ "ffn": {
78
+ "ffn_mult": 5.25,
79
+ "no_op": false,
80
+ "replace_with_linear": false,
81
+ "sparsify": null
82
+ }
83
+ },
84
+ {
85
+ "attention": {
86
+ "n_heads_in_group": 8,
87
+ "no_op": false,
88
+ "num_sink_tokens": null,
89
+ "replace_with_linear": false,
90
+ "sparsify": null,
91
+ "unshifted_sink": false,
92
+ "use_prefill_window_in_sink_attention": false,
93
+ "window_length": null
94
+ },
95
+ "ffn": {
96
+ "ffn_mult": 5.25,
97
+ "no_op": false,
98
+ "replace_with_linear": false,
99
+ "sparsify": null
100
+ }
101
+ },
102
+ {
103
+ "attention": {
104
+ "n_heads_in_group": 8,
105
+ "no_op": false,
106
+ "num_sink_tokens": null,
107
+ "replace_with_linear": false,
108
+ "sparsify": null,
109
+ "unshifted_sink": false,
110
+ "use_prefill_window_in_sink_attention": false,
111
+ "window_length": null
112
+ },
113
+ "ffn": {
114
+ "ffn_mult": 5.25,
115
+ "no_op": false,
116
+ "replace_with_linear": false,
117
+ "sparsify": null
118
+ }
119
+ },
120
+ {
121
+ "attention": {
122
+ "n_heads_in_group": null,
123
+ "no_op": true,
124
+ "num_sink_tokens": null,
125
+ "replace_with_linear": false,
126
+ "sparsify": null,
127
+ "unshifted_sink": false,
128
+ "use_prefill_window_in_sink_attention": false,
129
+ "window_length": null
130
+ },
131
+ "ffn": {
132
+ "ffn_mult": 2.625,
133
+ "no_op": false,
134
+ "replace_with_linear": false,
135
+ "sparsify": null
136
+ }
137
+ },
138
+ {
139
+ "attention": {
140
+ "n_heads_in_group": null,
141
+ "no_op": true,
142
+ "num_sink_tokens": null,
143
+ "replace_with_linear": false,
144
+ "sparsify": null,
145
+ "unshifted_sink": false,
146
+ "use_prefill_window_in_sink_attention": false,
147
+ "window_length": null
148
+ },
149
+ "ffn": {
150
+ "ffn_mult": 2.625,
151
+ "no_op": false,
152
+ "replace_with_linear": false,
153
+ "sparsify": null
154
+ }
155
+ },
156
+ {
157
+ "attention": {
158
+ "n_heads_in_group": 8,
159
+ "no_op": false,
160
+ "num_sink_tokens": null,
161
+ "replace_with_linear": false,
162
+ "sparsify": null,
163
+ "unshifted_sink": false,
164
+ "use_prefill_window_in_sink_attention": false,
165
+ "window_length": null
166
+ },
167
+ "ffn": {
168
+ "ffn_mult": 5.25,
169
+ "no_op": false,
170
+ "replace_with_linear": false,
171
+ "sparsify": null
172
+ }
173
+ },
174
+ {
175
+ "attention": {
176
+ "n_heads_in_group": 8,
177
+ "no_op": false,
178
+ "num_sink_tokens": null,
179
+ "replace_with_linear": false,
180
+ "sparsify": null,
181
+ "unshifted_sink": false,
182
+ "use_prefill_window_in_sink_attention": false,
183
+ "window_length": null
184
+ },
185
+ "ffn": {
186
+ "ffn_mult": 5.25,
187
+ "no_op": false,
188
+ "replace_with_linear": false,
189
+ "sparsify": null
190
+ }
191
+ },
192
+ {
193
+ "attention": {
194
+ "n_heads_in_group": 8,
195
+ "no_op": false,
196
+ "num_sink_tokens": null,
197
+ "replace_with_linear": false,
198
+ "sparsify": null,
199
+ "unshifted_sink": false,
200
+ "use_prefill_window_in_sink_attention": false,
201
+ "window_length": null
202
+ },
203
+ "ffn": {
204
+ "ffn_mult": 5.25,
205
+ "no_op": false,
206
+ "replace_with_linear": false,
207
+ "sparsify": null
208
+ }
209
+ },
210
+ {
211
+ "attention": {
212
+ "n_heads_in_group": null,
213
+ "no_op": true,
214
+ "num_sink_tokens": null,
215
+ "replace_with_linear": false,
216
+ "sparsify": null,
217
+ "unshifted_sink": false,
218
+ "use_prefill_window_in_sink_attention": false,
219
+ "window_length": null
220
+ },
221
+ "ffn": {
222
+ "ffn_mult": 3.28125,
223
+ "no_op": false,
224
+ "replace_with_linear": false,
225
+ "sparsify": null
226
+ }
227
+ },
228
+ {
229
+ "attention": {
230
+ "n_heads_in_group": 8,
231
+ "no_op": false,
232
+ "num_sink_tokens": null,
233
+ "replace_with_linear": false,
234
+ "sparsify": null,
235
+ "unshifted_sink": false,
236
+ "use_prefill_window_in_sink_attention": false,
237
+ "window_length": null
238
+ },
239
+ "ffn": {
240
+ "ffn_mult": 5.25,
241
+ "no_op": false,
242
+ "replace_with_linear": false,
243
+ "sparsify": null
244
+ }
245
+ },
246
+ {
247
+ "attention": {
248
+ "n_heads_in_group": 8,
249
+ "no_op": false,
250
+ "num_sink_tokens": null,
251
+ "replace_with_linear": false,
252
+ "sparsify": null,
253
+ "unshifted_sink": false,
254
+ "use_prefill_window_in_sink_attention": false,
255
+ "window_length": null
256
+ },
257
+ "ffn": {
258
+ "ffn_mult": 5.25,
259
+ "no_op": false,
260
+ "replace_with_linear": false,
261
+ "sparsify": null
262
+ }
263
+ },
264
+ {
265
+ "attention": {
266
+ "n_heads_in_group": 8,
267
+ "no_op": false,
268
+ "num_sink_tokens": null,
269
+ "replace_with_linear": false,
270
+ "sparsify": null,
271
+ "unshifted_sink": false,
272
+ "use_prefill_window_in_sink_attention": false,
273
+ "window_length": null
274
+ },
275
+ "ffn": {
276
+ "ffn_mult": 5.25,
277
+ "no_op": false,
278
+ "replace_with_linear": false,
279
+ "sparsify": null
280
+ }
281
+ },
282
+ {
283
+ "attention": {
284
+ "n_heads_in_group": 8,
285
+ "no_op": false,
286
+ "num_sink_tokens": null,
287
+ "replace_with_linear": false,
288
+ "sparsify": null,
289
+ "unshifted_sink": false,
290
+ "use_prefill_window_in_sink_attention": false,
291
+ "window_length": null
292
+ },
293
+ "ffn": {
294
+ "ffn_mult": 5.25,
295
+ "no_op": false,
296
+ "replace_with_linear": false,
297
+ "sparsify": null
298
+ }
299
+ },
300
+ {
301
+ "attention": {
302
+ "n_heads_in_group": 8,
303
+ "no_op": false,
304
+ "num_sink_tokens": null,
305
+ "replace_with_linear": false,
306
+ "sparsify": null,
307
+ "unshifted_sink": false,
308
+ "use_prefill_window_in_sink_attention": false,
309
+ "window_length": null
310
+ },
311
+ "ffn": {
312
+ "ffn_mult": 5.25,
313
+ "no_op": false,
314
+ "replace_with_linear": false,
315
+ "sparsify": null
316
+ }
317
+ },
318
+ {
319
+ "attention": {
320
+ "n_heads_in_group": 8,
321
+ "no_op": false,
322
+ "num_sink_tokens": null,
323
+ "replace_with_linear": false,
324
+ "sparsify": null,
325
+ "unshifted_sink": false,
326
+ "use_prefill_window_in_sink_attention": false,
327
+ "window_length": null
328
+ },
329
+ "ffn": {
330
+ "ffn_mult": 5.25,
331
+ "no_op": false,
332
+ "replace_with_linear": false,
333
+ "sparsify": null
334
+ }
335
+ },
336
+ {
337
+ "attention": {
338
+ "n_heads_in_group": 8,
339
+ "no_op": false,
340
+ "num_sink_tokens": null,
341
+ "replace_with_linear": false,
342
+ "sparsify": null,
343
+ "unshifted_sink": false,
344
+ "use_prefill_window_in_sink_attention": false,
345
+ "window_length": null
346
+ },
347
+ "ffn": {
348
+ "ffn_mult": 5.25,
349
+ "no_op": false,
350
+ "replace_with_linear": false,
351
+ "sparsify": null
352
+ }
353
+ },
354
+ {
355
+ "attention": {
356
+ "n_heads_in_group": 8,
357
+ "no_op": false,
358
+ "num_sink_tokens": null,
359
+ "replace_with_linear": false,
360
+ "sparsify": null,
361
+ "unshifted_sink": false,
362
+ "use_prefill_window_in_sink_attention": false,
363
+ "window_length": null
364
+ },
365
+ "ffn": {
366
+ "ffn_mult": 5.25,
367
+ "no_op": false,
368
+ "replace_with_linear": false,
369
+ "sparsify": null
370
+ }
371
+ },
372
+ {
373
+ "attention": {
374
+ "n_heads_in_group": 8,
375
+ "no_op": false,
376
+ "num_sink_tokens": null,
377
+ "replace_with_linear": false,
378
+ "sparsify": null,
379
+ "unshifted_sink": false,
380
+ "use_prefill_window_in_sink_attention": false,
381
+ "window_length": null
382
+ },
383
+ "ffn": {
384
+ "ffn_mult": 5.25,
385
+ "no_op": false,
386
+ "replace_with_linear": false,
387
+ "sparsify": null
388
+ }
389
+ },
390
+ {
391
+ "attention": {
392
+ "n_heads_in_group": 8,
393
+ "no_op": false,
394
+ "num_sink_tokens": null,
395
+ "replace_with_linear": false,
396
+ "sparsify": null,
397
+ "unshifted_sink": false,
398
+ "use_prefill_window_in_sink_attention": false,
399
+ "window_length": null
400
+ },
401
+ "ffn": {
402
+ "ffn_mult": 5.25,
403
+ "no_op": false,
404
+ "replace_with_linear": false,
405
+ "sparsify": null
406
+ }
407
+ },
408
+ {
409
+ "attention": {
410
+ "n_heads_in_group": 8,
411
+ "no_op": false,
412
+ "num_sink_tokens": null,
413
+ "replace_with_linear": false,
414
+ "sparsify": null,
415
+ "unshifted_sink": false,
416
+ "use_prefill_window_in_sink_attention": false,
417
+ "window_length": null
418
+ },
419
+ "ffn": {
420
+ "ffn_mult": 5.25,
421
+ "no_op": false,
422
+ "replace_with_linear": false,
423
+ "sparsify": null
424
+ }
425
+ },
426
+ {
427
+ "attention": {
428
+ "n_heads_in_group": 8,
429
+ "no_op": false,
430
+ "num_sink_tokens": null,
431
+ "replace_with_linear": false,
432
+ "sparsify": null,
433
+ "unshifted_sink": false,
434
+ "use_prefill_window_in_sink_attention": false,
435
+ "window_length": null
436
+ },
437
+ "ffn": {
438
+ "ffn_mult": 5.25,
439
+ "no_op": false,
440
+ "replace_with_linear": false,
441
+ "sparsify": null
442
+ }
443
+ },
444
+ {
445
+ "attention": {
446
+ "n_heads_in_group": 8,
447
+ "no_op": false,
448
+ "num_sink_tokens": null,
449
+ "replace_with_linear": false,
450
+ "sparsify": null,
451
+ "unshifted_sink": false,
452
+ "use_prefill_window_in_sink_attention": false,
453
+ "window_length": null
454
+ },
455
+ "ffn": {
456
+ "ffn_mult": 5.25,
457
+ "no_op": false,
458
+ "replace_with_linear": false,
459
+ "sparsify": null
460
+ }
461
+ },
462
+ {
463
+ "attention": {
464
+ "n_heads_in_group": 8,
465
+ "no_op": false,
466
+ "num_sink_tokens": null,
467
+ "replace_with_linear": false,
468
+ "sparsify": null,
469
+ "unshifted_sink": false,
470
+ "use_prefill_window_in_sink_attention": false,
471
+ "window_length": null
472
+ },
473
+ "ffn": {
474
+ "ffn_mult": 5.25,
475
+ "no_op": false,
476
+ "replace_with_linear": false,
477
+ "sparsify": null
478
+ }
479
+ },
480
+ {
481
+ "attention": {
482
+ "n_heads_in_group": 8,
483
+ "no_op": false,
484
+ "num_sink_tokens": null,
485
+ "replace_with_linear": false,
486
+ "sparsify": null,
487
+ "unshifted_sink": false,
488
+ "use_prefill_window_in_sink_attention": false,
489
+ "window_length": null
490
+ },
491
+ "ffn": {
492
+ "ffn_mult": 5.25,
493
+ "no_op": false,
494
+ "replace_with_linear": false,
495
+ "sparsify": null
496
+ }
497
+ },
498
+ {
499
+ "attention": {
500
+ "n_heads_in_group": 8,
501
+ "no_op": false,
502
+ "num_sink_tokens": null,
503
+ "replace_with_linear": false,
504
+ "sparsify": null,
505
+ "unshifted_sink": false,
506
+ "use_prefill_window_in_sink_attention": false,
507
+ "window_length": null
508
+ },
509
+ "ffn": {
510
+ "ffn_mult": 5.25,
511
+ "no_op": false,
512
+ "replace_with_linear": false,
513
+ "sparsify": null
514
+ }
515
+ },
516
+ {
517
+ "attention": {
518
+ "n_heads_in_group": 8,
519
+ "no_op": false,
520
+ "num_sink_tokens": null,
521
+ "replace_with_linear": false,
522
+ "sparsify": null,
523
+ "unshifted_sink": false,
524
+ "use_prefill_window_in_sink_attention": false,
525
+ "window_length": null
526
+ },
527
+ "ffn": {
528
+ "ffn_mult": 5.25,
529
+ "no_op": false,
530
+ "replace_with_linear": false,
531
+ "sparsify": null
532
+ }
533
+ },
534
+ {
535
+ "attention": {
536
+ "n_heads_in_group": 8,
537
+ "no_op": false,
538
+ "num_sink_tokens": null,
539
+ "replace_with_linear": false,
540
+ "sparsify": null,
541
+ "unshifted_sink": false,
542
+ "use_prefill_window_in_sink_attention": false,
543
+ "window_length": null
544
+ },
545
+ "ffn": {
546
+ "ffn_mult": 5.25,
547
+ "no_op": false,
548
+ "replace_with_linear": false,
549
+ "sparsify": null
550
+ }
551
+ },
552
+ {
553
+ "attention": {
554
+ "n_heads_in_group": 8,
555
+ "no_op": false,
556
+ "num_sink_tokens": null,
557
+ "replace_with_linear": false,
558
+ "sparsify": null,
559
+ "unshifted_sink": false,
560
+ "use_prefill_window_in_sink_attention": false,
561
+ "window_length": null
562
+ },
563
+ "ffn": {
564
+ "ffn_mult": 5.25,
565
+ "no_op": false,
566
+ "replace_with_linear": false,
567
+ "sparsify": null
568
+ }
569
+ },
570
+ {
571
+ "attention": {
572
+ "n_heads_in_group": 8,
573
+ "no_op": false,
574
+ "num_sink_tokens": null,
575
+ "replace_with_linear": false,
576
+ "sparsify": null,
577
+ "unshifted_sink": false,
578
+ "use_prefill_window_in_sink_attention": false,
579
+ "window_length": null
580
+ },
581
+ "ffn": {
582
+ "ffn_mult": 5.25,
583
+ "no_op": false,
584
+ "replace_with_linear": false,
585
+ "sparsify": null
586
+ }
587
+ },
588
+ {
589
+ "attention": {
590
+ "n_heads_in_group": 8,
591
+ "no_op": false,
592
+ "num_sink_tokens": null,
593
+ "replace_with_linear": false,
594
+ "sparsify": null,
595
+ "unshifted_sink": false,
596
+ "use_prefill_window_in_sink_attention": false,
597
+ "window_length": null
598
+ },
599
+ "ffn": {
600
+ "ffn_mult": 5.25,
601
+ "no_op": false,
602
+ "replace_with_linear": false,
603
+ "sparsify": null
604
+ }
605
+ },
606
+ {
607
+ "attention": {
608
+ "n_heads_in_group": 8,
609
+ "no_op": false,
610
+ "num_sink_tokens": null,
611
+ "replace_with_linear": false,
612
+ "sparsify": null,
613
+ "unshifted_sink": false,
614
+ "use_prefill_window_in_sink_attention": false,
615
+ "window_length": null
616
+ },
617
+ "ffn": {
618
+ "ffn_mult": 5.25,
619
+ "no_op": false,
620
+ "replace_with_linear": false,
621
+ "sparsify": null
622
+ }
623
+ },
624
+ {
625
+ "attention": {
626
+ "n_heads_in_group": 8,
627
+ "no_op": false,
628
+ "num_sink_tokens": null,
629
+ "replace_with_linear": false,
630
+ "sparsify": null,
631
+ "unshifted_sink": false,
632
+ "use_prefill_window_in_sink_attention": false,
633
+ "window_length": null
634
+ },
635
+ "ffn": {
636
+ "ffn_mult": 5.25,
637
+ "no_op": false,
638
+ "replace_with_linear": false,
639
+ "sparsify": null
640
+ }
641
+ },
642
+ {
643
+ "attention": {
644
+ "n_heads_in_group": 8,
645
+ "no_op": false,
646
+ "num_sink_tokens": null,
647
+ "replace_with_linear": false,
648
+ "sparsify": null,
649
+ "unshifted_sink": false,
650
+ "use_prefill_window_in_sink_attention": false,
651
+ "window_length": null
652
+ },
653
+ "ffn": {
654
+ "ffn_mult": 5.25,
655
+ "no_op": false,
656
+ "replace_with_linear": false,
657
+ "sparsify": null
658
+ }
659
+ },
660
+ {
661
+ "attention": {
662
+ "n_heads_in_group": 8,
663
+ "no_op": false,
664
+ "num_sink_tokens": null,
665
+ "replace_with_linear": false,
666
+ "sparsify": null,
667
+ "unshifted_sink": false,
668
+ "use_prefill_window_in_sink_attention": false,
669
+ "window_length": null
670
+ },
671
+ "ffn": {
672
+ "ffn_mult": 5.25,
673
+ "no_op": false,
674
+ "replace_with_linear": false,
675
+ "sparsify": null
676
+ }
677
+ },
678
+ {
679
+ "attention": {
680
+ "n_heads_in_group": 8,
681
+ "no_op": false,
682
+ "num_sink_tokens": null,
683
+ "replace_with_linear": false,
684
+ "sparsify": null,
685
+ "unshifted_sink": false,
686
+ "use_prefill_window_in_sink_attention": false,
687
+ "window_length": null
688
+ },
689
+ "ffn": {
690
+ "ffn_mult": 5.25,
691
+ "no_op": false,
692
+ "replace_with_linear": false,
693
+ "sparsify": null
694
+ }
695
+ },
696
+ {
697
+ "attention": {
698
+ "n_heads_in_group": 8,
699
+ "no_op": false,
700
+ "num_sink_tokens": null,
701
+ "replace_with_linear": false,
702
+ "sparsify": null,
703
+ "unshifted_sink": false,
704
+ "use_prefill_window_in_sink_attention": false,
705
+ "window_length": null
706
+ },
707
+ "ffn": {
708
+ "ffn_mult": 5.25,
709
+ "no_op": false,
710
+ "replace_with_linear": false,
711
+ "sparsify": null
712
+ }
713
+ },
714
+ {
715
+ "attention": {
716
+ "n_heads_in_group": 8,
717
+ "no_op": false,
718
+ "num_sink_tokens": null,
719
+ "replace_with_linear": false,
720
+ "sparsify": null,
721
+ "unshifted_sink": false,
722
+ "use_prefill_window_in_sink_attention": false,
723
+ "window_length": null
724
+ },
725
+ "ffn": {
726
+ "ffn_mult": 5.25,
727
+ "no_op": false,
728
+ "replace_with_linear": false,
729
+ "sparsify": null
730
+ }
731
+ },
732
+ {
733
+ "attention": {
734
+ "n_heads_in_group": 8,
735
+ "no_op": false,
736
+ "num_sink_tokens": null,
737
+ "replace_with_linear": false,
738
+ "sparsify": null,
739
+ "unshifted_sink": false,
740
+ "use_prefill_window_in_sink_attention": false,
741
+ "window_length": null
742
+ },
743
+ "ffn": {
744
+ "ffn_mult": 5.25,
745
+ "no_op": false,
746
+ "replace_with_linear": false,
747
+ "sparsify": null
748
+ }
749
+ },
750
+ {
751
+ "attention": {
752
+ "n_heads_in_group": 8,
753
+ "no_op": false,
754
+ "num_sink_tokens": null,
755
+ "replace_with_linear": false,
756
+ "sparsify": null,
757
+ "unshifted_sink": false,
758
+ "use_prefill_window_in_sink_attention": false,
759
+ "window_length": null
760
+ },
761
+ "ffn": {
762
+ "ffn_mult": 5.25,
763
+ "no_op": false,
764
+ "replace_with_linear": false,
765
+ "sparsify": null
766
+ }
767
+ },
768
+ {
769
+ "attention": {
770
+ "n_heads_in_group": null,
771
+ "no_op": true,
772
+ "num_sink_tokens": null,
773
+ "replace_with_linear": false,
774
+ "sparsify": null,
775
+ "unshifted_sink": false,
776
+ "use_prefill_window_in_sink_attention": false,
777
+ "window_length": null
778
+ },
779
+ "ffn": {
780
+ "ffn_mult": 1.3125,
781
+ "no_op": false,
782
+ "replace_with_linear": false,
783
+ "sparsify": null
784
+ }
785
+ },
786
+ {
787
+ "attention": {
788
+ "n_heads_in_group": null,
789
+ "no_op": true,
790
+ "num_sink_tokens": null,
791
+ "replace_with_linear": false,
792
+ "sparsify": null,
793
+ "unshifted_sink": false,
794
+ "use_prefill_window_in_sink_attention": false,
795
+ "window_length": null
796
+ },
797
+ "ffn": {
798
+ "ffn_mult": 2.625,
799
+ "no_op": false,
800
+ "replace_with_linear": false,
801
+ "sparsify": null
802
+ }
803
+ },
804
+ {
805
+ "attention": {
806
+ "n_heads_in_group": null,
807
+ "no_op": true,
808
+ "num_sink_tokens": null,
809
+ "replace_with_linear": false,
810
+ "sparsify": null,
811
+ "unshifted_sink": false,
812
+ "use_prefill_window_in_sink_attention": false,
813
+ "window_length": null
814
+ },
815
+ "ffn": {
816
+ "ffn_mult": 2.625,
817
+ "no_op": false,
818
+ "replace_with_linear": false,
819
+ "sparsify": null
820
+ }
821
+ },
822
+ {
823
+ "attention": {
824
+ "n_heads_in_group": null,
825
+ "no_op": true,
826
+ "num_sink_tokens": null,
827
+ "replace_with_linear": false,
828
+ "sparsify": null,
829
+ "unshifted_sink": false,
830
+ "use_prefill_window_in_sink_attention": false,
831
+ "window_length": null
832
+ },
833
+ "ffn": {
834
+ "ffn_mult": 1.3125,
835
+ "no_op": false,
836
+ "replace_with_linear": false,
837
+ "sparsify": null
838
+ }
839
+ },
840
+ {
841
+ "attention": {
842
+ "n_heads_in_group": null,
843
+ "no_op": true,
844
+ "num_sink_tokens": null,
845
+ "replace_with_linear": false,
846
+ "sparsify": null,
847
+ "unshifted_sink": false,
848
+ "use_prefill_window_in_sink_attention": false,
849
+ "window_length": null
850
+ },
851
+ "ffn": {
852
+ "ffn_mult": 5.25,
853
+ "no_op": false,
854
+ "replace_with_linear": false,
855
+ "sparsify": null
856
+ }
857
+ },
858
+ {
859
+ "attention": {
860
+ "n_heads_in_group": null,
861
+ "no_op": true,
862
+ "num_sink_tokens": null,
863
+ "replace_with_linear": false,
864
+ "sparsify": null,
865
+ "unshifted_sink": false,
866
+ "use_prefill_window_in_sink_attention": false,
867
+ "window_length": null
868
+ },
869
+ "ffn": {
870
+ "ffn_mult": 1.3125,
871
+ "no_op": false,
872
+ "replace_with_linear": false,
873
+ "sparsify": null
874
+ }
875
+ },
876
+ {
877
+ "attention": {
878
+ "n_heads_in_group": null,
879
+ "no_op": true,
880
+ "num_sink_tokens": null,
881
+ "replace_with_linear": false,
882
+ "sparsify": null,
883
+ "unshifted_sink": false,
884
+ "use_prefill_window_in_sink_attention": false,
885
+ "window_length": null
886
+ },
887
+ "ffn": {
888
+ "ffn_mult": 2.625,
889
+ "no_op": false,
890
+ "replace_with_linear": false,
891
+ "sparsify": null
892
+ }
893
+ },
894
+ {
895
+ "attention": {
896
+ "n_heads_in_group": null,
897
+ "no_op": true,
898
+ "num_sink_tokens": null,
899
+ "replace_with_linear": false,
900
+ "sparsify": null,
901
+ "unshifted_sink": false,
902
+ "use_prefill_window_in_sink_attention": false,
903
+ "window_length": null
904
+ },
905
+ "ffn": {
906
+ "ffn_mult": 1.3125,
907
+ "no_op": false,
908
+ "replace_with_linear": false,
909
+ "sparsify": null
910
+ }
911
+ },
912
+ {
913
+ "attention": {
914
+ "n_heads_in_group": null,
915
+ "no_op": true,
916
+ "num_sink_tokens": null,
917
+ "replace_with_linear": false,
918
+ "sparsify": null,
919
+ "unshifted_sink": false,
920
+ "use_prefill_window_in_sink_attention": false,
921
+ "window_length": null
922
+ },
923
+ "ffn": {
924
+ "ffn_mult": 1.3125,
925
+ "no_op": false,
926
+ "replace_with_linear": false,
927
+ "sparsify": null
928
+ }
929
+ },
930
+ {
931
+ "attention": {
932
+ "n_heads_in_group": null,
933
+ "no_op": true,
934
+ "num_sink_tokens": null,
935
+ "replace_with_linear": false,
936
+ "sparsify": null,
937
+ "unshifted_sink": false,
938
+ "use_prefill_window_in_sink_attention": false,
939
+ "window_length": null
940
+ },
941
+ "ffn": {
942
+ "ffn_mult": 1.3125,
943
+ "no_op": false,
944
+ "replace_with_linear": false,
945
+ "sparsify": null
946
+ }
947
+ },
948
+ {
949
+ "attention": {
950
+ "n_heads_in_group": 8,
951
+ "no_op": false,
952
+ "num_sink_tokens": null,
953
+ "replace_with_linear": false,
954
+ "sparsify": null,
955
+ "unshifted_sink": false,
956
+ "use_prefill_window_in_sink_attention": false,
957
+ "window_length": null
958
+ },
959
+ "ffn": {
960
+ "ffn_mult": 5.25,
961
+ "no_op": false,
962
+ "replace_with_linear": false,
963
+ "sparsify": null
964
+ }
965
+ },
966
+ {
967
+ "attention": {
968
+ "n_heads_in_group": null,
969
+ "no_op": true,
970
+ "num_sink_tokens": null,
971
+ "replace_with_linear": false,
972
+ "sparsify": null,
973
+ "unshifted_sink": false,
974
+ "use_prefill_window_in_sink_attention": false,
975
+ "window_length": null
976
+ },
977
+ "ffn": {
978
+ "ffn_mult": 1.3125,
979
+ "no_op": false,
980
+ "replace_with_linear": false,
981
+ "sparsify": null
982
+ }
983
+ },
984
+ {
985
+ "attention": {
986
+ "n_heads_in_group": null,
987
+ "no_op": true,
988
+ "num_sink_tokens": null,
989
+ "replace_with_linear": false,
990
+ "sparsify": null,
991
+ "unshifted_sink": false,
992
+ "use_prefill_window_in_sink_attention": false,
993
+ "window_length": null
994
+ },
995
+ "ffn": {
996
+ "ffn_mult": 1.0,
997
+ "no_op": false,
998
+ "replace_with_linear": false,
999
+ "sparsify": null
1000
+ }
1001
+ },
1002
+ {
1003
+ "attention": {
1004
+ "n_heads_in_group": null,
1005
+ "no_op": true,
1006
+ "num_sink_tokens": null,
1007
+ "replace_with_linear": false,
1008
+ "sparsify": null,
1009
+ "unshifted_sink": false,
1010
+ "use_prefill_window_in_sink_attention": false,
1011
+ "window_length": null
1012
+ },
1013
+ "ffn": {
1014
+ "ffn_mult": 1.0,
1015
+ "no_op": false,
1016
+ "replace_with_linear": false,
1017
+ "sparsify": null
1018
+ }
1019
+ },
1020
+ {
1021
+ "attention": {
1022
+ "n_heads_in_group": null,
1023
+ "no_op": true,
1024
+ "num_sink_tokens": null,
1025
+ "replace_with_linear": false,
1026
+ "sparsify": null,
1027
+ "unshifted_sink": false,
1028
+ "use_prefill_window_in_sink_attention": false,
1029
+ "window_length": null
1030
+ },
1031
+ "ffn": {
1032
+ "ffn_mult": 1.3125,
1033
+ "no_op": false,
1034
+ "replace_with_linear": false,
1035
+ "sparsify": null
1036
+ }
1037
+ },
1038
+ {
1039
+ "attention": {
1040
+ "n_heads_in_group": null,
1041
+ "no_op": true,
1042
+ "num_sink_tokens": null,
1043
+ "replace_with_linear": false,
1044
+ "sparsify": null,
1045
+ "unshifted_sink": false,
1046
+ "use_prefill_window_in_sink_attention": false,
1047
+ "window_length": null
1048
+ },
1049
+ "ffn": {
1050
+ "ffn_mult": 1.0,
1051
+ "no_op": false,
1052
+ "replace_with_linear": false,
1053
+ "sparsify": null
1054
+ }
1055
+ },
1056
+ {
1057
+ "attention": {
1058
+ "n_heads_in_group": null,
1059
+ "no_op": true,
1060
+ "num_sink_tokens": null,
1061
+ "replace_with_linear": false,
1062
+ "sparsify": null,
1063
+ "unshifted_sink": false,
1064
+ "use_prefill_window_in_sink_attention": false,
1065
+ "window_length": null
1066
+ },
1067
+ "ffn": {
1068
+ "ffn_mult": 1.0,
1069
+ "no_op": false,
1070
+ "replace_with_linear": false,
1071
+ "sparsify": null
1072
+ }
1073
+ },
1074
+ {
1075
+ "attention": {
1076
+ "n_heads_in_group": null,
1077
+ "no_op": true,
1078
+ "num_sink_tokens": null,
1079
+ "replace_with_linear": false,
1080
+ "sparsify": null,
1081
+ "unshifted_sink": false,
1082
+ "use_prefill_window_in_sink_attention": false,
1083
+ "window_length": null
1084
+ },
1085
+ "ffn": {
1086
+ "ffn_mult": 1.0,
1087
+ "no_op": false,
1088
+ "replace_with_linear": false,
1089
+ "sparsify": null
1090
+ }
1091
+ },
1092
+ {
1093
+ "attention": {
1094
+ "n_heads_in_group": null,
1095
+ "no_op": true,
1096
+ "num_sink_tokens": null,
1097
+ "replace_with_linear": false,
1098
+ "sparsify": null,
1099
+ "unshifted_sink": false,
1100
+ "use_prefill_window_in_sink_attention": false,
1101
+ "window_length": null
1102
+ },
1103
+ "ffn": {
1104
+ "ffn_mult": 1.3125,
1105
+ "no_op": false,
1106
+ "replace_with_linear": false,
1107
+ "sparsify": null
1108
+ }
1109
+ },
1110
+ {
1111
+ "attention": {
1112
+ "n_heads_in_group": null,
1113
+ "no_op": true,
1114
+ "num_sink_tokens": null,
1115
+ "replace_with_linear": false,
1116
+ "sparsify": null,
1117
+ "unshifted_sink": false,
1118
+ "use_prefill_window_in_sink_attention": false,
1119
+ "window_length": null
1120
+ },
1121
+ "ffn": {
1122
+ "ffn_mult": 1.3125,
1123
+ "no_op": false,
1124
+ "replace_with_linear": false,
1125
+ "sparsify": null
1126
+ }
1127
+ },
1128
+ {
1129
+ "attention": {
1130
+ "n_heads_in_group": null,
1131
+ "no_op": true,
1132
+ "num_sink_tokens": null,
1133
+ "replace_with_linear": false,
1134
+ "sparsify": null,
1135
+ "unshifted_sink": false,
1136
+ "use_prefill_window_in_sink_attention": false,
1137
+ "window_length": null
1138
+ },
1139
+ "ffn": {
1140
+ "ffn_mult": 0.5,
1141
+ "no_op": false,
1142
+ "replace_with_linear": false,
1143
+ "sparsify": null
1144
+ }
1145
+ },
1146
+ {
1147
+ "attention": {
1148
+ "n_heads_in_group": null,
1149
+ "no_op": true,
1150
+ "num_sink_tokens": null,
1151
+ "replace_with_linear": false,
1152
+ "sparsify": null,
1153
+ "unshifted_sink": false,
1154
+ "use_prefill_window_in_sink_attention": false,
1155
+ "window_length": null
1156
+ },
1157
+ "ffn": {
1158
+ "ffn_mult": 0.5,
1159
+ "no_op": false,
1160
+ "replace_with_linear": false,
1161
+ "sparsify": null
1162
+ }
1163
+ },
1164
+ {
1165
+ "attention": {
1166
+ "n_heads_in_group": null,
1167
+ "no_op": true,
1168
+ "num_sink_tokens": null,
1169
+ "replace_with_linear": false,
1170
+ "sparsify": null,
1171
+ "unshifted_sink": false,
1172
+ "use_prefill_window_in_sink_attention": false,
1173
+ "window_length": null
1174
+ },
1175
+ "ffn": {
1176
+ "ffn_mult": 1.0,
1177
+ "no_op": false,
1178
+ "replace_with_linear": false,
1179
+ "sparsify": null
1180
+ }
1181
+ },
1182
+ {
1183
+ "attention": {
1184
+ "n_heads_in_group": null,
1185
+ "no_op": true,
1186
+ "num_sink_tokens": null,
1187
+ "replace_with_linear": false,
1188
+ "sparsify": null,
1189
+ "unshifted_sink": false,
1190
+ "use_prefill_window_in_sink_attention": false,
1191
+ "window_length": null
1192
+ },
1193
+ "ffn": {
1194
+ "ffn_mult": 1.0,
1195
+ "no_op": false,
1196
+ "replace_with_linear": false,
1197
+ "sparsify": null
1198
+ }
1199
+ },
1200
+ {
1201
+ "attention": {
1202
+ "n_heads_in_group": null,
1203
+ "no_op": true,
1204
+ "num_sink_tokens": null,
1205
+ "replace_with_linear": false,
1206
+ "sparsify": null,
1207
+ "unshifted_sink": false,
1208
+ "use_prefill_window_in_sink_attention": false,
1209
+ "window_length": null
1210
+ },
1211
+ "ffn": {
1212
+ "ffn_mult": 0.5,
1213
+ "no_op": false,
1214
+ "replace_with_linear": false,
1215
+ "sparsify": null
1216
+ }
1217
+ },
1218
+ {
1219
+ "attention": {
1220
+ "n_heads_in_group": null,
1221
+ "no_op": true,
1222
+ "num_sink_tokens": null,
1223
+ "replace_with_linear": false,
1224
+ "sparsify": null,
1225
+ "unshifted_sink": false,
1226
+ "use_prefill_window_in_sink_attention": false,
1227
+ "window_length": null
1228
+ },
1229
+ "ffn": {
1230
+ "ffn_mult": 0.5,
1231
+ "no_op": false,
1232
+ "replace_with_linear": false,
1233
+ "sparsify": null
1234
+ }
1235
+ },
1236
+ {
1237
+ "attention": {
1238
+ "n_heads_in_group": null,
1239
+ "no_op": true,
1240
+ "num_sink_tokens": null,
1241
+ "replace_with_linear": false,
1242
+ "sparsify": null,
1243
+ "unshifted_sink": false,
1244
+ "use_prefill_window_in_sink_attention": false,
1245
+ "window_length": null
1246
+ },
1247
+ "ffn": {
1248
+ "ffn_mult": 1.0,
1249
+ "no_op": false,
1250
+ "replace_with_linear": false,
1251
+ "sparsify": null
1252
+ }
1253
+ },
1254
+ {
1255
+ "attention": {
1256
+ "n_heads_in_group": null,
1257
+ "no_op": true,
1258
+ "num_sink_tokens": null,
1259
+ "replace_with_linear": false,
1260
+ "sparsify": null,
1261
+ "unshifted_sink": false,
1262
+ "use_prefill_window_in_sink_attention": false,
1263
+ "window_length": null
1264
+ },
1265
+ "ffn": {
1266
+ "ffn_mult": 0.5,
1267
+ "no_op": false,
1268
+ "replace_with_linear": false,
1269
+ "sparsify": null
1270
+ }
1271
+ },
1272
+ {
1273
+ "attention": {
1274
+ "n_heads_in_group": null,
1275
+ "no_op": true,
1276
+ "num_sink_tokens": null,
1277
+ "replace_with_linear": false,
1278
+ "sparsify": null,
1279
+ "unshifted_sink": false,
1280
+ "use_prefill_window_in_sink_attention": false,
1281
+ "window_length": null
1282
+ },
1283
+ "ffn": {
1284
+ "ffn_mult": 0.5,
1285
+ "no_op": false,
1286
+ "replace_with_linear": false,
1287
+ "sparsify": null
1288
+ }
1289
+ },
1290
+ {
1291
+ "attention": {
1292
+ "n_heads_in_group": 8,
1293
+ "no_op": false,
1294
+ "num_sink_tokens": null,
1295
+ "replace_with_linear": false,
1296
+ "sparsify": null,
1297
+ "unshifted_sink": false,
1298
+ "use_prefill_window_in_sink_attention": false,
1299
+ "window_length": null
1300
+ },
1301
+ "ffn": {
1302
+ "ffn_mult": 5.25,
1303
+ "no_op": false,
1304
+ "replace_with_linear": false,
1305
+ "sparsify": null
1306
+ }
1307
+ },
1308
+ {
1309
+ "attention": {
1310
+ "n_heads_in_group": 8,
1311
+ "no_op": false,
1312
+ "num_sink_tokens": null,
1313
+ "replace_with_linear": false,
1314
+ "sparsify": null,
1315
+ "unshifted_sink": false,
1316
+ "use_prefill_window_in_sink_attention": false,
1317
+ "window_length": null
1318
+ },
1319
+ "ffn": {
1320
+ "ffn_mult": 5.25,
1321
+ "no_op": false,
1322
+ "replace_with_linear": false,
1323
+ "sparsify": null
1324
+ }
1325
+ },
1326
+ {
1327
+ "attention": {
1328
+ "n_heads_in_group": 8,
1329
+ "no_op": false,
1330
+ "num_sink_tokens": null,
1331
+ "replace_with_linear": false,
1332
+ "sparsify": null,
1333
+ "unshifted_sink": false,
1334
+ "use_prefill_window_in_sink_attention": false,
1335
+ "window_length": null
1336
+ },
1337
+ "ffn": {
1338
+ "ffn_mult": 5.25,
1339
+ "no_op": false,
1340
+ "replace_with_linear": false,
1341
+ "sparsify": null
1342
+ }
1343
+ },
1344
+ {
1345
+ "attention": {
1346
+ "n_heads_in_group": 8,
1347
+ "no_op": false,
1348
+ "num_sink_tokens": null,
1349
+ "replace_with_linear": false,
1350
+ "sparsify": null,
1351
+ "unshifted_sink": false,
1352
+ "use_prefill_window_in_sink_attention": false,
1353
+ "window_length": null
1354
+ },
1355
+ "ffn": {
1356
+ "ffn_mult": 5.25,
1357
+ "no_op": false,
1358
+ "replace_with_linear": false,
1359
+ "sparsify": null
1360
+ }
1361
+ },
1362
+ {
1363
+ "attention": {
1364
+ "n_heads_in_group": 8,
1365
+ "no_op": false,
1366
+ "num_sink_tokens": null,
1367
+ "replace_with_linear": false,
1368
+ "sparsify": null,
1369
+ "unshifted_sink": false,
1370
+ "use_prefill_window_in_sink_attention": false,
1371
+ "window_length": null
1372
+ },
1373
+ "ffn": {
1374
+ "ffn_mult": 5.25,
1375
+ "no_op": false,
1376
+ "replace_with_linear": false,
1377
+ "sparsify": null
1378
+ }
1379
+ },
1380
+ {
1381
+ "attention": {
1382
+ "n_heads_in_group": 8,
1383
+ "no_op": false,
1384
+ "num_sink_tokens": null,
1385
+ "replace_with_linear": false,
1386
+ "sparsify": null,
1387
+ "unshifted_sink": false,
1388
+ "use_prefill_window_in_sink_attention": false,
1389
+ "window_length": null
1390
+ },
1391
+ "ffn": {
1392
+ "ffn_mult": 5.25,
1393
+ "no_op": false,
1394
+ "replace_with_linear": false,
1395
+ "sparsify": null
1396
+ }
1397
+ },
1398
+ {
1399
+ "attention": {
1400
+ "n_heads_in_group": 8,
1401
+ "no_op": false,
1402
+ "num_sink_tokens": null,
1403
+ "replace_with_linear": false,
1404
+ "sparsify": null,
1405
+ "unshifted_sink": false,
1406
+ "use_prefill_window_in_sink_attention": false,
1407
+ "window_length": null
1408
+ },
1409
+ "ffn": {
1410
+ "ffn_mult": 5.25,
1411
+ "no_op": false,
1412
+ "replace_with_linear": false,
1413
+ "sparsify": null
1414
+ }
1415
+ },
1416
+ {
1417
+ "attention": {
1418
+ "n_heads_in_group": 8,
1419
+ "no_op": false,
1420
+ "num_sink_tokens": null,
1421
+ "replace_with_linear": false,
1422
+ "sparsify": null,
1423
+ "unshifted_sink": false,
1424
+ "use_prefill_window_in_sink_attention": false,
1425
+ "window_length": null
1426
+ },
1427
+ "ffn": {
1428
+ "ffn_mult": 5.25,
1429
+ "no_op": false,
1430
+ "replace_with_linear": false,
1431
+ "sparsify": null
1432
+ }
1433
+ },
1434
+ {
1435
+ "attention": {
1436
+ "n_heads_in_group": 8,
1437
+ "no_op": false,
1438
+ "num_sink_tokens": null,
1439
+ "replace_with_linear": false,
1440
+ "sparsify": null,
1441
+ "unshifted_sink": false,
1442
+ "use_prefill_window_in_sink_attention": false,
1443
+ "window_length": null
1444
+ },
1445
+ "ffn": {
1446
+ "ffn_mult": 5.25,
1447
+ "no_op": false,
1448
+ "replace_with_linear": false,
1449
+ "sparsify": null
1450
+ }
1451
+ }
1452
+ ],
1453
+ "bos_token_id": 128000,
1454
+ "eos_token_id": [
1455
+ 128001,
1456
+ 128008,
1457
+ 128009
1458
+ ],
1459
+ "hidden_act": "silu",
1460
+ "hidden_size": 8192,
1461
+ "initializer_range": 0.02,
1462
+ "intermediate_size": null,
1463
+ "max_position_embeddings": 131072,
1464
+ "mlp_bias": false,
1465
+ "model_type": "nemotron-nas",
1466
+ "num_attention_heads": 64,
1467
+ "num_hidden_layers": 80,
1468
+ "num_key_value_heads": null,
1469
+ "pretraining_tp": 1,
1470
+ "quantization": {
1471
+ "group_size": 64,
1472
+ "bits": 8
1473
+ },
1474
+ "quantization_config": {
1475
+ "group_size": 64,
1476
+ "bits": 8
1477
+ },
1478
+ "rms_norm_eps": 1e-05,
1479
+ "rope_scaling": {
1480
+ "factor": 16.0,
1481
+ "high_freq_factor": 4.0,
1482
+ "low_freq_factor": 1.0,
1483
+ "original_max_position_embeddings": 8192,
1484
+ "rope_type": "llama3"
1485
+ },
1486
+ "rope_theta": 500000.0,
1487
+ "tie_word_embeddings": false,
1488
+ "torch_dtype": "bfloat16",
1489
+ "transformers_version": "4.48.3",
1490
+ "use_cache": true,
1491
+ "vocab_size": 128256
1492
+ }