Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92e889ae2100e3a454aef7a21f08602f4abf24bc251dfa7fa6efc6079f217d8b
|
| 3 |
+
size 402920470
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6fa74304d4b331eb1f581b885b9f3df33dd140df553e8e3ce2dd5e213514593
|
| 3 |
+
size 402920470
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4fb934629636441e02cf7af7b0d439237e1d42e362da599f3ba005939d04f69c
|
| 3 |
+
size 402920470
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86ab3e1887bde228099636fbe868c580c5f131162a9805e1df38ab3f414da93d
|
| 3 |
+
size 402920470
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:300eec2a85fc84fe6a6898c1f68a10ce4c812cde9598729f8f06e43238833b22
|
| 3 |
+
size 402920470
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4aa3157ed8a9b1bb0f22ad802298a3db2c6311aa8763d4c17898fe856b2199b
|
| 3 |
+
size 402920470
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca9e29b47344005be867a37e899ef8680767c8d52f74b82cea6b1348ec8bfe31
|
| 3 |
+
size 403444758
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80a74112f952844d40b7379aeb5d00a168da62d8fd2ac6b9c95da5067385f1ef
|
| 3 |
+
size 403444758
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09652ea312d5eb1ac79109adf6e161146ede32656294d3cb812b909917e95898
|
| 3 |
+
size 403444758
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02864f5049818d9aaaca08d508566eb4bb0c2ee9822e9d55e20be7a8b45de99c
|
| 3 |
+
size 403444758
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a71414125a875e59acf07114d61390cec0a3919a633fd06f4e3b40a032658395
|
| 3 |
+
size 403444758
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:791df5c3501a51a40f2b8dc136aecc18636d096decb401761a825fca98e09c3a
|
| 3 |
+
size 403444758
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca6b34359dacb91df5944f29cff61978dac1df618b3be527705282afac5e9fcf
|
| 3 |
+
size 403182431
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:446f7f62641dddd439cf884d1eb546ca068897c521805b7cbbe5dad26f72e88b
|
| 3 |
+
size 403182431
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78b71c1673165c4f3b17dd96ce760d2e571b9ae363e2e2e7cb8cf6284537b068
|
| 3 |
+
size 403182431
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:059c46bf1aa9c0c9ae0f751d9ec64a82b90cae2f53bf640b7ddddcbb0be961f6
|
| 3 |
+
size 403182431
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4c7b8673c9c75afe0c0bce8672b4215921f85cbdbdd1d4e4b41f093760aaf06
|
| 3 |
+
size 403182431
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb0337c0a94a85d4ed3a0e67f2e985960d0fcb872317ce809ac3e4ae1efb862f
|
| 3 |
+
size 403182431
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94d7d4f0936be97fff0771c502fbb324a479de4062484c3abad7a76e013b39e2
|
| 3 |
+
size 402920717
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 20,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 7.06076750610814, "l1_loss": 55.502148946126304, "l0": 19.923000162298028, "frac_variance_explained": 0.9277354551084114, "cossim": 0.9538878672050707, "l2_ratio": 0.9604759089874498, "relative_reconstruction_bias": 1.0046033064524333, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.945203484910907, "loss_zero": 12.187079458525687, "frac_recovered": 0.9620419469746676, "frac_alive": 0.600677490234375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8bbc9e48d3bca0cd9dc0b7d02dd482e1885ac4abde23cfc382e419330e048ce5
|
| 3 |
+
size 402920717
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 40,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.194424441366484, "l1_loss": 85.02606224291253, "l0": 39.807070992209695, "frac_variance_explained": 0.9439049525694414, "cossim": 0.964661079825777, "l2_ratio": 0.9720291910749493, "relative_reconstruction_bias": 1.0047995177182285, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.8184412428826997, "loss_zero": 12.187079458525687, "frac_recovered": 0.9762207269668579, "frac_alive": 0.653564453125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ff82ae2e3d70c46cbdf8775af63dcd7d569d9644fb6ffb24fafa5e240605b46
|
| 3 |
+
size 402920717
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.450232867038611, "l1_loss": 107.3414394494259, "l0": 79.58225065289122, "frac_variance_explained": 0.956367785280401, "cossim": 0.972661397673867, "l2_ratio": 0.9780536167549364, "relative_reconstruction_bias": 1.004194028449781, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.745052102840308, "loss_zero": 12.187079458525687, "frac_recovered": 0.9844544566038883, "frac_alive": 0.56951904296875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb6aa56ceac048dce5fef95c3876c67dd150b7656da902fc0dff4cc341f68bd1
|
| 3 |
+
size 402920717
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 4.619117390025746, "l1_loss": 190.5597150398023, "l0": 159.13051304672703, "frac_variance_explained": 0.9687698682149252, "cossim": 0.9805189949093442, "l2_ratio": 0.985734917900779, "relative_reconstruction_bias": 1.0036569039026897, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6877589045148906, "loss_zero": 12.187079458525687, "frac_recovered": 0.9908258788513414, "frac_alive": 0.4089202880859375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ca6d6ffcf625867dc63240e8bed8d923e60166cc1c335f54cbf93617b310d17
|
| 3 |
+
size 402920717
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 320,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7a4a858562064de91a6225081914e0096b6ad110c29e57b12cad1bc6f14ff68
|
| 3 |
+
size 402920717
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2cf8f9fea7c1f01ea5b9d2f1edc87c3d7f588ec5e4871f41a5c28ec0890f3cf
|
| 3 |
+
size 402920104
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de9b42b83893a64fcb8e5eae72e35ac180349873730833aca4bfb19888dad8eb
|
| 3 |
+
size 402920104
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e13bb4423307fcfa3ba1820491236e65882ae795fabc44418ecc4894a2fa5dff
|
| 3 |
+
size 402920104
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2698654939c2890c0b1c27001608a2db5689505a4a3905b1f8491cfe3ff1149e
|
| 3 |
+
size 402920104
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45907780c89994187635465306183d9ea47ab14ae84892436b5de85eb47c3490
|
| 3 |
+
size 402920104
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8856a08dd880d51d4a83e9deec591f7b2b146f6817ab2805c39a1c4b5b3f4177
|
| 3 |
+
size 402920104
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1ae2cd98daffefa25d143e34213cab1f94575b2795040c685e1adeb68a2dc08
|
| 3 |
+
size 402920104
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b802595dc84c9f825edb925906317a7478e5908c8fd07ac27c396d3b53ea1663
|
| 3 |
+
size 402920104
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ac53f97bcadf146499cef9567c5483f1d9727df40dde27f6db8a2f16c68ed77
|
| 3 |
+
size 402920104
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2e0326a5b0ac5c55c5e9fa2f5944f16cf63b9171c2c226a938a560ea38d1cb5
|
| 3 |
+
size 402920104
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2caf6669bb28aa94aac807d63cb99635498b2cc6a679b9bd1b4886b508266b45
|
| 3 |
+
size 402920104
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2461da6d96f14205c55aa144795118725624306d21f285688d7abcd7988d4c90
|
| 3 |
+
size 402920104
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cb8cd8832679e1a3747bcd5ad613368b6c794761b1f73dfc597fdb04f110dbb
|
| 3 |
+
size 402920470
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b69de5d93bc1239a1738955d03bcf8902638535f476b2a56792a32da64e1d4f7
|
| 3 |
+
size 402920470
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3516579d32eca10d40a3c5ca9568b08174d8204ddfe9e667f3948b83d1970496
|
| 3 |
+
size 402920470
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a86abe75db90dcbf93e011fe7422d3549d85ae2c258031e65c9d901484e363f8
|
| 3 |
+
size 402920470
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe1d755917f772c2ad2d44de69a78196ca1671f778d64f70e85d576757dd8a14
|
| 3 |
+
size 402920470
|