mgoin committed · verified
Commit 44f1710 · 1 Parent(s): 5693b5c

Upload folder using huggingface_hub
config.json CHANGED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ea372538cf59768508624be88431d34477f0f5064d87ff5d4ad40c1a2cd012d3
-size 5000641848
+oid sha256:d35795482399c22b20165089b381739017b132adeb3208e74f9808da6700ae61
+size 5000680840
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:528a85dedde41455807faf4c52950325f4bf83ff8b609d489725f438d87bf916
-size 3311202632
+oid sha256:889d27f737530f2c0b5ab3187f74e33024baa5d25a6362cc15e69f0428c6bfe9
+size 3311230472
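Both shards are stored through Git LFS, so the diffs above only show the pointer files: a spec version, the sha256 of the payload, and its byte size. A minimal sketch for checking a downloaded shard against the new pointer values, assuming the file sits in the current directory (path, oid, and size are copied from the diff above):

```python
import hashlib

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Stream the file and compare its sha256 digest and byte count."""
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        # Read in 1 MiB chunks so memory stays flat for the ~5 GB shard.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

print(verify_lfs_pointer(
    "model-00001-of-00002.safetensors",
    "d35795482399c22b20165089b381739017b132adeb3208e74f9808da6700ae61",
    5000680840,
))
```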
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 8310164736
+    "total_size": 8310237312
   },
   "weight_map": {
     "lm_head.weight": "model-00002-of-00002.safetensors",
@@ -556,9 +556,7 @@
     "model.layers.0.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.0.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.0.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.0.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.0.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.0.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.0.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -1126,9 +1124,7 @@
     "model.layers.1.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.1.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.1.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.1.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.1.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.1.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.1.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -1696,9 +1692,7 @@
     "model.layers.10.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.10.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.10.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.10.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.10.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.10.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.10.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -2266,9 +2260,7 @@
     "model.layers.11.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.11.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.11.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.11.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.11.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.11.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.11.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -2836,9 +2828,7 @@
     "model.layers.12.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.12.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.12.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.12.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.12.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.12.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.12.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -3406,9 +3396,7 @@
     "model.layers.13.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.13.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.13.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.13.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.13.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.13.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.13.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -3976,9 +3964,7 @@
     "model.layers.14.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.14.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.14.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.14.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.14.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.14.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.14.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.14.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -4546,9 +4532,7 @@
     "model.layers.15.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.15.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.15.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.15.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.15.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.15.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.15.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.15.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -5116,9 +5100,7 @@
     "model.layers.16.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.16.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.16.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.16.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.16.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.16.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.16.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.16.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -5686,9 +5668,7 @@
     "model.layers.17.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.17.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.17.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.17.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.17.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.17.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.17.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.17.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -6256,9 +6236,7 @@
     "model.layers.18.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.18.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.18.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.18.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.18.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.18.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.18.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -6826,9 +6804,7 @@
     "model.layers.19.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.19.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.19.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.19.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.19.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.19.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.19.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.19.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -7396,9 +7372,7 @@
     "model.layers.2.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.2.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.2.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.2.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.2.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.2.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.2.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -7966,9 +7940,7 @@
     "model.layers.20.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.20.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.20.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.20.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.20.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.20.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.20.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -8536,9 +8508,7 @@
     "model.layers.21.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.21.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.21.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.21.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.21.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.21.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.21.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -9106,9 +9076,7 @@
     "model.layers.22.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.22.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.22.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.22.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.22.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.22.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.22.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -9676,9 +9644,7 @@
     "model.layers.23.mlp.shared_expert.up_proj.weight_packed": "model-00002-of-00002.safetensors",
     "model.layers.23.mlp.shared_expert.up_proj.weight_scale": "model-00002-of-00002.safetensors",
     "model.layers.23.mlp.shared_expert.up_proj.weight_shape": "model-00002-of-00002.safetensors",
-    "model.layers.23.mlp.shared_expert_gate.weight_packed": "model-00002-of-00002.safetensors",
-    "model.layers.23.mlp.shared_expert_gate.weight_scale": "model-00002-of-00002.safetensors",
-    "model.layers.23.mlp.shared_expert_gate.weight_shape": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.shared_expert_gate.weight": "model-00002-of-00002.safetensors",
     "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
     "model.layers.23.self_attn.k_proj.weight_packed": "model-00002-of-00002.safetensors",
@@ -10246,9 +10212,7 @@
     "model.layers.3.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.3.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.3.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.3.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.3.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.3.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.3.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -10816,9 +10780,7 @@
     "model.layers.4.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.4.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.4.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.4.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.4.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.4.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.4.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -11386,9 +11348,7 @@
     "model.layers.5.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.5.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.5.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.5.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.5.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.5.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.5.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -11956,9 +11916,7 @@
     "model.layers.6.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.6.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.6.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.6.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.6.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.6.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.6.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -12526,9 +12484,7 @@
     "model.layers.7.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.7.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.7.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.7.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.7.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.7.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.7.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -13096,9 +13052,7 @@
     "model.layers.8.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.8.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.8.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.8.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.8.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.8.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.8.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
@@ -13666,9 +13620,7 @@
     "model.layers.9.mlp.shared_expert.up_proj.weight_packed": "model-00001-of-00002.safetensors",
     "model.layers.9.mlp.shared_expert.up_proj.weight_scale": "model-00001-of-00002.safetensors",
     "model.layers.9.mlp.shared_expert.up_proj.weight_shape": "model-00001-of-00002.safetensors",
-    "model.layers.9.mlp.shared_expert_gate.weight_packed": "model-00001-of-00002.safetensors",
-    "model.layers.9.mlp.shared_expert_gate.weight_scale": "model-00001-of-00002.safetensors",
-    "model.layers.9.mlp.shared_expert_gate.weight_shape": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.shared_expert_gate.weight": "model-00001-of-00002.safetensors",
     "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
     "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
     "model.layers.9.self_attn.k_proj.weight_packed": "model-00001-of-00002.safetensors",
recipe.yaml CHANGED
@@ -3,4 +3,4 @@ DEFAULT_stage:
     GPTQModifier:
       scheme: W4A16
       targets: Linear
-      ignore: [lm_head, 're:.*mlp.gate$']
+      ignore: [lm_head, 're:.*mlp.gate$', 're:.*mlp.shared_expert_gate$']
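The index change falls out of this one-line recipe change. GPTQModifier's `ignore` list matches module names, with the `re:` prefix marking a regex; the old pattern caught only the MoE router `mlp.gate`, so the small `mlp.shared_expert_gate` Linear layers were being quantized along with everything else. A short check of why the extra pattern is needed, using illustrative module names:

```python
import re

# Illustrative module names for one decoder layer (not read from the repo).
names = [
    "model.layers.0.mlp.gate",                # MoE router gate
    "model.layers.0.mlp.shared_expert_gate",  # shared-expert gate
]
for pat in (r".*mlp.gate$", r".*mlp.shared_expert_gate$"):
    print(pat, [bool(re.match(pat, n)) for n in names])
# .*mlp.gate$               -> [True, False]  (misses the shared-expert gate)
# .*mlp.shared_expert_gate$ -> [False, True]  (the newly added pattern)
```

With the new pattern, both gates stay at full precision; the cost is only the ~3024 bytes per layer seen in the index delta above.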