Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +32 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +28 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +29 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +35 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +35 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +35 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +35 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +35 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +35 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 20,
|
| 17 |
+
"device": "cuda:1",
|
| 18 |
+
"layer": 8,
|
| 19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
|
| 21 |
+
"submodule_name": "resid_post_layer_8"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 768,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 32,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:1"
|
| 31 |
+
}
|
| 32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.8115614688757695, "l1_loss": 52.64818711714311, "l0": 19.92191522771662, "frac_variance_explained": 0.932711478435632, "cossim": 0.9568742730400779, "l2_ratio": 0.9564891547867747, "relative_reconstruction_bias": 1.0002475091905305, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.925948244152647, "loss_zero": 12.187079458525687, "frac_recovered": 0.9643127231886892, "frac_alive": 0.5512237548828125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 40,
|
| 17 |
+
"device": "cuda:1",
|
| 18 |
+
"layer": 8,
|
| 19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
| 21 |
+
"submodule_name": "resid_post_layer_8"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 768,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 32,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:1"
|
| 31 |
+
}
|
| 32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.042276859283447, "l1_loss": 66.77286783854167, "l0": 39.831973335959695, "frac_variance_explained": 0.9466052254041036, "cossim": 0.9660982634081985, "l2_ratio": 0.9657998157270027, "relative_reconstruction_bias": 0.9996290405591329, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.8093490925702183, "loss_zero": 12.187079458525687, "frac_recovered": 0.9772408442063765, "frac_alive": 0.5221405029296875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 80,
|
| 17 |
+
"device": "cuda:1",
|
| 18 |
+
"layer": 8,
|
| 19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
| 21 |
+
"submodule_name": "resid_post_layer_8"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 768,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 32,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:1"
|
| 31 |
+
}
|
| 32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.2391165675538955, "l1_loss": 89.92644292658025, "l0": 79.69068376945727, "frac_variance_explained": 0.9595911448652094, "cossim": 0.9746276718197446, "l2_ratio": 0.9746691133036758, "relative_reconstruction_bias": 1.0001698316949788, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.732304092609521, "loss_zero": 12.187079458525687, "frac_recovered": 0.9858564571900801, "frac_alive": 0.5078582763671875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 160,
|
| 17 |
+
"device": "cuda:1",
|
| 18 |
+
"layer": 8,
|
| 19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
| 21 |
+
"submodule_name": "resid_post_layer_8"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 768,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 32,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:1"
|
| 31 |
+
}
|
| 32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 4.364285093365294, "l1_loss": 141.87809152314156, "l0": 159.25443799567944, "frac_variance_explained": 0.9720097888599742, "cossim": 0.9825305776162581, "l2_ratio": 0.9826186534130212, "relative_reconstruction_bias": 0.999792527068745, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.679041689092463, "loss_zero": 12.187079458525687, "frac_recovered": 0.9918394431923375, "frac_alive": 0.412506103515625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 320,
|
| 17 |
+
"device": "cuda:1",
|
| 18 |
+
"layer": 8,
|
| 19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
| 21 |
+
"submodule_name": "resid_post_layer_8"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 768,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 32,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:1"
|
| 31 |
+
}
|
| 32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.0580187060616235, "l1_loss": 284.2038999615294, "l0": 318.5178435354522, "frac_variance_explained": 0.986518079584295, "cossim": 0.9915731350580851, "l2_ratio": 0.9917072816328569, "relative_reconstruction_bias": 1.0007029446688565, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.634615547729261, "loss_zero": 12.187079458525687, "frac_recovered": 0.9968373215559757, "frac_alive": 0.179290771484375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 640,
|
| 17 |
+
"device": "cuda:1",
|
| 18 |
+
"layer": 8,
|
| 19 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
| 21 |
+
"submodule_name": "resid_post_layer_8"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 768,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 32,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:1"
|
| 31 |
+
}
|
| 32 |
+
}
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 1.457940484538223, "l1_loss": 614.061464251894, "l0": 639.2937807025331, "frac_variance_explained": 0.9967457995270238, "cossim": 0.9980556982936282, "l2_ratio": 0.9979188803470496, "relative_reconstruction_bias": 1.0004513372074475, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.612772403341351, "loss_zero": 12.187079458525687, "frac_recovered": 0.9992812926119025, "frac_alive": 0.019287109375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.012,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 0,
|
| 13 |
+
"device": "cuda:1",
|
| 14 |
+
"layer": 8,
|
| 15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
|
| 17 |
+
"submodule_name": "resid_post_layer_8"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 768,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 244,
|
| 23 |
+
"ctx_len": 1024,
|
| 24 |
+
"refresh_batch_size": 32,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:1"
|
| 27 |
+
}
|
| 28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 2.6180462219628944, "l1_loss": 224.72570129762212, "l0": 506.2615560508636, "frac_variance_explained": 0.9901319092296692, "cossim": 0.993862024990909, "l2_ratio": 0.9944691848324005, "relative_reconstruction_bias": 1.0011837999504734, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6131112532443312, "loss_zero": 12.979128625019488, "frac_recovered": 0.9976450305387198, "frac_alive": 0.4523468017578125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.018,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 0,
|
| 13 |
+
"device": "cuda:1",
|
| 14 |
+
"layer": 8,
|
| 15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
| 17 |
+
"submodule_name": "resid_post_layer_8"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 768,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 244,
|
| 23 |
+
"ctx_len": 1024,
|
| 24 |
+
"refresh_batch_size": 32,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:1"
|
| 27 |
+
}
|
| 28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.733102491103023, "l1_loss": 160.86592699533486, "l0": 362.118231899767, "frac_variance_explained": 0.9797103846647653, "cossim": 0.9874217219381447, "l2_ratio": 0.9876558999699282, "relative_reconstruction_bias": 1.000781415456749, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6406399679471213, "loss_zero": 12.979128625019488, "frac_recovered": 0.994674064308764, "frac_alive": 0.6014251708984375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.024,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 0,
|
| 13 |
+
"device": "cuda:1",
|
| 14 |
+
"layer": 8,
|
| 15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
| 17 |
+
"submodule_name": "resid_post_layer_8"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 768,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 244,
|
| 23 |
+
"ctx_len": 1024,
|
| 24 |
+
"refresh_batch_size": 32,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:1"
|
| 27 |
+
}
|
| 28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 4.420620622405087, "l1_loss": 142.89749053587397, "l0": 234.2815245662827, "frac_variance_explained": 0.9713993664965572, "cossim": 0.9822882794472109, "l2_ratio": 0.9829563304602381, "relative_reconstruction_bias": 1.0009008653192635, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6666263485529336, "loss_zero": 12.979128625019488, "frac_recovered": 0.9918424081371491, "frac_alive": 0.4338531494140625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.04,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 0,
|
| 13 |
+
"device": "cuda:1",
|
| 14 |
+
"layer": 8,
|
| 15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
| 17 |
+
"submodule_name": "resid_post_layer_8"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 768,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 244,
|
| 23 |
+
"ctx_len": 1024,
|
| 24 |
+
"refresh_batch_size": 32,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:1"
|
| 27 |
+
}
|
| 28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.348246614617038, "l1_loss": 92.95999977387578, "l0": 93.34840075940971, "frac_variance_explained": 0.9577777655009764, "cossim": 0.9738572807197111, "l2_ratio": 0.9730504180293486, "relative_reconstruction_bias": 0.9996947261942438, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7255357712148185, "loss_zero": 12.979128625019488, "frac_recovered": 0.9855547730463097, "frac_alive": 0.5794677734375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.06,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 0,
|
| 13 |
+
"device": "cuda:1",
|
| 14 |
+
"layer": 8,
|
| 15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
| 17 |
+
"submodule_name": "resid_post_layer_8"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 768,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 244,
|
| 23 |
+
"ctx_len": 1024,
|
| 24 |
+
"refresh_batch_size": 32,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:1"
|
| 27 |
+
}
|
| 28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.9910652264054995, "l1_loss": 62.50791938046375, "l0": 47.74804623155709, "frac_variance_explained": 0.9470674661268671, "cossim": 0.9670502421367599, "l2_ratio": 0.9660871851156994, "relative_reconstruction_bias": 1.0001960612205139, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7880731490721185, "loss_zero": 12.979128625019488, "frac_recovered": 0.9788411503814789, "frac_alive": 0.619537353515625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.08,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 0,
|
| 13 |
+
"device": "cuda:1",
|
| 14 |
+
"layer": 8,
|
| 15 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 16 |
+
"wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
| 17 |
+
"submodule_name": "resid_post_layer_8"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 768,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 244,
|
| 23 |
+
"ctx_len": 1024,
|
| 24 |
+
"refresh_batch_size": 32,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:1"
|
| 27 |
+
}
|
| 28 |
+
}
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.536454628749066, "l1_loss": 54.30702179598521, "l0": 30.933502128325312, "frac_variance_explained": 0.9372849065855325, "cossim": 0.9606502716799816, "l2_ratio": 0.9597143128693822, "relative_reconstruction_bias": 1.0001993872315051, "loss_original": 2.591329812285412, "loss_reconstructed": 2.858764309480966, "loss_zero": 12.979128625019488, "frac_recovered": 0.9712351309247764, "frac_alive": 0.49896240234375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"activation_dim": 768,
|
| 9 |
+
"dict_size": 65536,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 8,
|
| 12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
|
| 14 |
+
"submodule_name": "resid_post_layer_8",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 20
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 768,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 244,
|
| 24 |
+
"ctx_len": 1024,
|
| 25 |
+
"refresh_batch_size": 32,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 7.139337657445885, "l1_loss": 59.039245306727395, "l0": 20.30936800715435, "frac_variance_explained": 0.925017165850444, "cossim": 0.9526806782527142, "l2_ratio": 0.9529376367488539, "relative_reconstruction_bias": 1.000274724270924, "loss_original": 2.591329812285412, "loss_reconstructed": 2.969967363110508, "loss_zero": 12.979128625019488, "frac_recovered": 0.959245752498328, "frac_alive": 0.2391815185546875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"activation_dim": 768,
|
| 9 |
+
"dict_size": 65536,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 8,
|
| 12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
| 14 |
+
"submodule_name": "resid_post_layer_8",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 40
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 768,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 244,
|
| 24 |
+
"ctx_len": 1024,
|
| 25 |
+
"refresh_batch_size": 32,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.315503855785692, "l1_loss": 69.46419074736446, "l0": 38.521164905594055, "frac_variance_explained": 0.9411711089582329, "cossim": 0.9631029457212931, "l2_ratio": 0.9633634108376791, "relative_reconstruction_bias": 1.000237949641354, "loss_original": 2.591329812285412, "loss_reconstructed": 2.8283018778605635, "loss_zero": 12.979128625019488, "frac_recovered": 0.9744724344058209, "frac_alive": 0.2889251708984375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"activation_dim": 768,
|
| 9 |
+
"dict_size": 65536,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 8,
|
| 12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
| 14 |
+
"submodule_name": "resid_post_layer_8",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 80
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 768,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 244,
|
| 24 |
+
"ctx_len": 1024,
|
| 25 |
+
"refresh_batch_size": 32,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.575546784573291, "l1_loss": 87.10252729668674, "l0": 75.86958933451089, "frac_variance_explained": 0.9541435697710657, "cossim": 0.9713434024029467, "l2_ratio": 0.9709410197045429, "relative_reconstruction_bias": 1.000590334455651, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7422697220940186, "loss_zero": 12.979128625019488, "frac_recovered": 0.983704393527594, "frac_alive": 0.25250244140625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"activation_dim": 768,
|
| 9 |
+
"dict_size": 65536,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 8,
|
| 12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
| 14 |
+
"submodule_name": "resid_post_layer_8",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 160
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 768,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 244,
|
| 24 |
+
"ctx_len": 1024,
|
| 25 |
+
"refresh_batch_size": 32,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 4.9314600588327435, "l1_loss": 116.4271825307823, "l0": 133.42655972400343, "frac_variance_explained": 0.9642347908881773, "cossim": 0.9776697323982975, "l2_ratio": 0.9770763559513781, "relative_reconstruction_bias": 1.0007411794490124, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6929130008421747, "loss_zero": 12.979128625019488, "frac_recovered": 0.9890197440084204, "frac_alive": 0.180206298828125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"activation_dim": 768,
|
| 9 |
+
"dict_size": 65536,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 8,
|
| 12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
| 14 |
+
"submodule_name": "resid_post_layer_8",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 320
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 768,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 244,
|
| 24 |
+
"ctx_len": 1024,
|
| 25 |
+
"refresh_batch_size": 32,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.5989560319716674, "l1_loss": 205.8147625291204, "l0": 288.3598384627377, "frac_variance_explained": 0.9810939133167267, "cossim": 0.9881818513554262, "l2_ratio": 0.9883791021553867, "relative_reconstruction_bias": 0.9997902907520891, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6320626211453635, "loss_zero": 12.979128625019488, "frac_recovered": 0.9955570661878012, "frac_alive": 0.090667724609375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"activation_dim": 768,
|
| 9 |
+
"dict_size": 65536,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 8,
|
| 12 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
| 14 |
+
"submodule_name": "resid_post_layer_8",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 640
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 768,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 244,
|
| 24 |
+
"ctx_len": 1024,
|
| 25 |
+
"refresh_batch_size": 32,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.2377584095460823, "l1_loss": 453.48774222868036, "l0": 601.1393484092621, "frac_variance_explained": 0.9818732688225895, "cossim": 0.9869504359113165, "l2_ratio": 1.0059552444032875, "relative_reconstruction_bias": 1.0114614389028893, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6055084273039575, "loss_zero": 12.979128625019488, "frac_recovered": 0.9984711311667799, "frac_alive": 0.0184478759765625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.3245140133482036, "l1_loss": 330.21591741388494, "l0": 318.3319563432173, "frac_variance_explained": 0.9841687444484595, "cossim": 0.9901527003808455, "l2_ratio": 0.9942391442530083, "relative_reconstruction_bias": 1.0023493802908696, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.638994989973126, "loss_zero": 12.187079458525687, "frac_recovered": 0.9963269107269518, "frac_alive": 0.1301422119140625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 640,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 1.3593918771454783, "l1_loss": 728.8317464192709, "l0": 639.0425174597538, "frac_variance_explained": 0.9971729137680747, "cossim": 0.9983109615065835, "l2_ratio": 0.9978185458616777, "relative_reconstruction_bias": 1.0002086866985669, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.61249912146366, "loss_zero": 12.187079458525687, "frac_recovered": 0.9993282484285759, "frac_alive": 0.014617919921875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "PAnnealTrainer",
|
| 4 |
+
"dict_class": "AutoEncoder",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"sparsity_function": "Lp^p",
|
| 9 |
+
"sparsity_penalty": 0.006,
|
| 10 |
+
"p_start": 1.0,
|
| 11 |
+
"p_end": 0.2,
|
| 12 |
+
"anneal_start": 10000,
|
| 13 |
+
"sparsity_queue_length": 10,
|
| 14 |
+
"n_sparsity_updates": 10,
|
| 15 |
+
"warmup_steps": 1000,
|
| 16 |
+
"sparsity_warmup_steps": 5000,
|
| 17 |
+
"decay_start": 195312,
|
| 18 |
+
"resample_steps": null,
|
| 19 |
+
"steps": 244140,
|
| 20 |
+
"seed": 0,
|
| 21 |
+
"layer": 8,
|
| 22 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 23 |
+
"wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
|
| 24 |
+
"submodule_name": "resid_post_layer_8"
|
| 25 |
+
},
|
| 26 |
+
"buffer": {
|
| 27 |
+
"d_submodule": 768,
|
| 28 |
+
"io": "out",
|
| 29 |
+
"n_ctxs": 244,
|
| 30 |
+
"ctx_len": 1024,
|
| 31 |
+
"refresh_batch_size": 32,
|
| 32 |
+
"out_batch_size": 2048,
|
| 33 |
+
"device": "cuda:0"
|
| 34 |
+
}
|
| 35 |
+
}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.670030557748043, "l1_loss": 141.251840764826, "l0": 348.4003194173177, "frac_variance_explained": 0.9806513605695782, "cossim": 0.9877766822323655, "l2_ratio": 0.9837823004433603, "relative_reconstruction_bias": 0.9981156695972789, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6619137381062363, "loss_zero": 12.187079458525687, "frac_recovered": 0.9938127037250635, "frac_alive": 0.3325347900390625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "PAnnealTrainer",
|
| 4 |
+
"dict_class": "AutoEncoder",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"sparsity_function": "Lp^p",
|
| 9 |
+
"sparsity_penalty": 0.008,
|
| 10 |
+
"p_start": 1.0,
|
| 11 |
+
"p_end": 0.2,
|
| 12 |
+
"anneal_start": 10000,
|
| 13 |
+
"sparsity_queue_length": 10,
|
| 14 |
+
"n_sparsity_updates": 10,
|
| 15 |
+
"warmup_steps": 1000,
|
| 16 |
+
"sparsity_warmup_steps": 5000,
|
| 17 |
+
"decay_start": 195312,
|
| 18 |
+
"resample_steps": null,
|
| 19 |
+
"steps": 244140,
|
| 20 |
+
"seed": 0,
|
| 21 |
+
"layer": 8,
|
| 22 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 23 |
+
"wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
| 24 |
+
"submodule_name": "resid_post_layer_8"
|
| 25 |
+
},
|
| 26 |
+
"buffer": {
|
| 27 |
+
"d_submodule": 768,
|
| 28 |
+
"io": "out",
|
| 29 |
+
"n_ctxs": 244,
|
| 30 |
+
"ctx_len": 1024,
|
| 31 |
+
"refresh_batch_size": 32,
|
| 32 |
+
"out_batch_size": 2048,
|
| 33 |
+
"device": "cuda:0"
|
| 34 |
+
}
|
| 35 |
+
}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 4.522386738748262, "l1_loss": 105.70531741055575, "l0": 222.64559289180872, "frac_variance_explained": 0.9702411980339976, "cossim": 0.9813343160080187, "l2_ratio": 0.9774827162424723, "relative_reconstruction_bias": 0.9988945054285454, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.700730562210083, "loss_zero": 12.187079458525687, "frac_recovered": 0.9894037842750549, "frac_alive": 0.3546600341796875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "PAnnealTrainer",
|
| 4 |
+
"dict_class": "AutoEncoder",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"sparsity_function": "Lp^p",
|
| 9 |
+
"sparsity_penalty": 0.01,
|
| 10 |
+
"p_start": 1.0,
|
| 11 |
+
"p_end": 0.2,
|
| 12 |
+
"anneal_start": 10000,
|
| 13 |
+
"sparsity_queue_length": 10,
|
| 14 |
+
"n_sparsity_updates": 10,
|
| 15 |
+
"warmup_steps": 1000,
|
| 16 |
+
"sparsity_warmup_steps": 5000,
|
| 17 |
+
"decay_start": 195312,
|
| 18 |
+
"resample_steps": null,
|
| 19 |
+
"steps": 244140,
|
| 20 |
+
"seed": 0,
|
| 21 |
+
"layer": 8,
|
| 22 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 23 |
+
"wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
| 24 |
+
"submodule_name": "resid_post_layer_8"
|
| 25 |
+
},
|
| 26 |
+
"buffer": {
|
| 27 |
+
"d_submodule": 768,
|
| 28 |
+
"io": "out",
|
| 29 |
+
"n_ctxs": 244,
|
| 30 |
+
"ctx_len": 1024,
|
| 31 |
+
"refresh_batch_size": 32,
|
| 32 |
+
"out_batch_size": 2048,
|
| 33 |
+
"device": "cuda:0"
|
| 34 |
+
}
|
| 35 |
+
}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.095679933374578, "l1_loss": 85.60162006724964, "l0": 147.9398868445194, "frac_variance_explained": 0.9617998256827845, "cossim": 0.9761915622335492, "l2_ratio": 0.9722904725508257, "relative_reconstruction_bias": 0.9986938003337744, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7340729850711245, "loss_zero": 12.187079458525687, "frac_recovered": 0.9857035152839891, "frac_alive": 0.364471435546875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "PAnnealTrainer",
|
| 4 |
+
"dict_class": "AutoEncoder",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"sparsity_function": "Lp^p",
|
| 9 |
+
"sparsity_penalty": 0.015,
|
| 10 |
+
"p_start": 1.0,
|
| 11 |
+
"p_end": 0.2,
|
| 12 |
+
"anneal_start": 10000,
|
| 13 |
+
"sparsity_queue_length": 10,
|
| 14 |
+
"n_sparsity_updates": 10,
|
| 15 |
+
"warmup_steps": 1000,
|
| 16 |
+
"sparsity_warmup_steps": 5000,
|
| 17 |
+
"decay_start": 195312,
|
| 18 |
+
"resample_steps": null,
|
| 19 |
+
"steps": 244140,
|
| 20 |
+
"seed": 0,
|
| 21 |
+
"layer": 8,
|
| 22 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 23 |
+
"wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
| 24 |
+
"submodule_name": "resid_post_layer_8"
|
| 25 |
+
},
|
| 26 |
+
"buffer": {
|
| 27 |
+
"d_submodule": 768,
|
| 28 |
+
"io": "out",
|
| 29 |
+
"n_ctxs": 244,
|
| 30 |
+
"ctx_len": 1024,
|
| 31 |
+
"refresh_batch_size": 32,
|
| 32 |
+
"out_batch_size": 2048,
|
| 33 |
+
"device": "cuda:0"
|
| 34 |
+
}
|
| 35 |
+
}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.043845350092107, "l1_loss": 59.91714119188713, "l0": 64.48810577392578, "frac_variance_explained": 0.9458111127217611, "cossim": 0.9662979913480354, "l2_ratio": 0.9622128280726346, "relative_reconstruction_bias": 0.9980922232974659, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.8208962353793057, "loss_zero": 12.187079458525687, "frac_recovered": 0.976002908114231, "frac_alive": 0.368621826171875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "PAnnealTrainer",
|
| 4 |
+
"dict_class": "AutoEncoder",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"sparsity_function": "Lp^p",
|
| 9 |
+
"sparsity_penalty": 0.02,
|
| 10 |
+
"p_start": 1.0,
|
| 11 |
+
"p_end": 0.2,
|
| 12 |
+
"anneal_start": 10000,
|
| 13 |
+
"sparsity_queue_length": 10,
|
| 14 |
+
"n_sparsity_updates": 10,
|
| 15 |
+
"warmup_steps": 1000,
|
| 16 |
+
"sparsity_warmup_steps": 5000,
|
| 17 |
+
"decay_start": 195312,
|
| 18 |
+
"resample_steps": null,
|
| 19 |
+
"steps": 244140,
|
| 20 |
+
"seed": 0,
|
| 21 |
+
"layer": 8,
|
| 22 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 23 |
+
"wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
| 24 |
+
"submodule_name": "resid_post_layer_8"
|
| 25 |
+
},
|
| 26 |
+
"buffer": {
|
| 27 |
+
"d_submodule": 768,
|
| 28 |
+
"io": "out",
|
| 29 |
+
"n_ctxs": 244,
|
| 30 |
+
"ctx_len": 1024,
|
| 31 |
+
"refresh_batch_size": 32,
|
| 32 |
+
"out_batch_size": 2048,
|
| 33 |
+
"device": "cuda:0"
|
| 34 |
+
}
|
| 35 |
+
}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.61760934193929, "l1_loss": 49.408933234937265, "l0": 36.88696647412849, "frac_variance_explained": 0.9357700709140662, "cossim": 0.9595213604695869, "l2_ratio": 0.9548207214384368, "relative_reconstruction_bias": 0.9978785514831543, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.903652989503109, "loss_zero": 12.187079458525687, "frac_recovered": 0.9667834169936903, "frac_alive": 0.3637542724609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "PAnnealTrainer",
|
| 4 |
+
"dict_class": "AutoEncoder",
|
| 5 |
+
"activation_dim": 768,
|
| 6 |
+
"dict_size": 65536,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"sparsity_function": "Lp^p",
|
| 9 |
+
"sparsity_penalty": 0.025,
|
| 10 |
+
"p_start": 1.0,
|
| 11 |
+
"p_end": 0.2,
|
| 12 |
+
"anneal_start": 10000,
|
| 13 |
+
"sparsity_queue_length": 10,
|
| 14 |
+
"n_sparsity_updates": 10,
|
| 15 |
+
"warmup_steps": 1000,
|
| 16 |
+
"sparsity_warmup_steps": 5000,
|
| 17 |
+
"decay_start": 195312,
|
| 18 |
+
"resample_steps": null,
|
| 19 |
+
"steps": 244140,
|
| 20 |
+
"seed": 0,
|
| 21 |
+
"layer": 8,
|
| 22 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 23 |
+
"wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
| 24 |
+
"submodule_name": "resid_post_layer_8"
|
| 25 |
+
},
|
| 26 |
+
"buffer": {
|
| 27 |
+
"d_submodule": 768,
|
| 28 |
+
"io": "out",
|
| 29 |
+
"n_ctxs": 244,
|
| 30 |
+
"ctx_len": 1024,
|
| 31 |
+
"refresh_batch_size": 32,
|
| 32 |
+
"out_batch_size": 2048,
|
| 33 |
+
"device": "cuda:0"
|
| 34 |
+
}
|
| 35 |
+
}
|