adamkarvonen commited on Feb 4

Commit

4dd38b6

verified ·

1 Parent(s): 35289d3

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +32 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +32 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +32 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +32 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +32 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +32 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +28 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +28 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +28 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +28 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +28 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +28 -0
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +29 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +29 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +29 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +29 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +29 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +29 -0
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +53 -0
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json +1 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json +35 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json +1 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +35 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +35 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +35 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +35 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +35 -0

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "k": 20,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 6.8115614688757695, "l1_loss": 52.64818711714311, "l0": 19.92191522771662, "frac_variance_explained": 0.932711478435632, "cossim": 0.9568742730400779, "l2_ratio": 0.9564891547867747, "relative_reconstruction_bias": 1.0002475091905305, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.925948244152647, "loss_zero": 12.187079458525687, "frac_recovered": 0.9643127231886892, "frac_alive": 0.5512237548828125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "k": 40,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 6.042276859283447, "l1_loss": 66.77286783854167, "l0": 39.831973335959695, "frac_variance_explained": 0.9466052254041036, "cossim": 0.9660982634081985, "l2_ratio": 0.9657998157270027, "relative_reconstruction_bias": 0.9996290405591329, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.8093490925702183, "loss_zero": 12.187079458525687, "frac_recovered": 0.9772408442063765, "frac_alive": 0.5221405029296875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "k": 80,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 5.2391165675538955, "l1_loss": 89.92644292658025, "l0": 79.69068376945727, "frac_variance_explained": 0.9595911448652094, "cossim": 0.9746276718197446, "l2_ratio": 0.9746691133036758, "relative_reconstruction_bias": 1.0001698316949788, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.732304092609521, "loss_zero": 12.187079458525687, "frac_recovered": 0.9858564571900801, "frac_alive": 0.5078582763671875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "k": 160,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 4.364285093365294, "l1_loss": 141.87809152314156, "l0": 159.25443799567944, "frac_variance_explained": 0.9720097888599742, "cossim": 0.9825305776162581, "l2_ratio": 0.9826186534130212, "relative_reconstruction_bias": 0.999792527068745, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.679041689092463, "loss_zero": 12.187079458525687, "frac_recovered": 0.9918394431923375, "frac_alive": 0.412506103515625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "k": 320,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 3.0580187060616235, "l1_loss": 284.2038999615294, "l0": 318.5178435354522, "frac_variance_explained": 0.986518079584295, "cossim": 0.9915731350580851, "l2_ratio": 0.9917072816328569, "relative_reconstruction_bias": 1.0007029446688565, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.634615547729261, "loss_zero": 12.187079458525687, "frac_recovered": 0.9968373215559757, "frac_alive": 0.179290771484375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "k": 640,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "BatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 1.457940484538223, "l1_loss": 614.061464251894, "l0": 639.2937807025331, "frac_variance_explained": 0.9967457995270238, "cossim": 0.9980556982936282, "l2_ratio": 0.9979188803470496, "relative_reconstruction_bias": 1.0004513372074475, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.612772403341351, "loss_zero": 12.187079458525687, "frac_recovered": 0.9992812926119025, "frac_alive": 0.019287109375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "trainer": {
+        "dict_class": "GatedAutoEncoder",
+        "trainer_class": "GatedSAETrainer",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "l1_penalty": 0.012,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "seed": 0,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 2.6180462219628944, "l1_loss": 224.72570129762212, "l0": 506.2615560508636, "frac_variance_explained": 0.9901319092296692, "cossim": 0.993862024990909, "l2_ratio": 0.9944691848324005, "relative_reconstruction_bias": 1.0011837999504734, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6131112532443312, "loss_zero": 12.979128625019488, "frac_recovered": 0.9976450305387198, "frac_alive": 0.4523468017578125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "trainer": {
+        "dict_class": "GatedAutoEncoder",
+        "trainer_class": "GatedSAETrainer",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "l1_penalty": 0.018,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "seed": 0,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 3.733102491103023, "l1_loss": 160.86592699533486, "l0": 362.118231899767, "frac_variance_explained": 0.9797103846647653, "cossim": 0.9874217219381447, "l2_ratio": 0.9876558999699282, "relative_reconstruction_bias": 1.000781415456749, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6406399679471213, "loss_zero": 12.979128625019488, "frac_recovered": 0.994674064308764, "frac_alive": 0.6014251708984375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "trainer": {
+        "dict_class": "GatedAutoEncoder",
+        "trainer_class": "GatedSAETrainer",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "l1_penalty": 0.024,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "seed": 0,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 4.420620622405087, "l1_loss": 142.89749053587397, "l0": 234.2815245662827, "frac_variance_explained": 0.9713993664965572, "cossim": 0.9822882794472109, "l2_ratio": 0.9829563304602381, "relative_reconstruction_bias": 1.0009008653192635, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6666263485529336, "loss_zero": 12.979128625019488, "frac_recovered": 0.9918424081371491, "frac_alive": 0.4338531494140625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "trainer": {
+        "dict_class": "GatedAutoEncoder",
+        "trainer_class": "GatedSAETrainer",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "l1_penalty": 0.04,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "seed": 0,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 5.348246614617038, "l1_loss": 92.95999977387578, "l0": 93.34840075940971, "frac_variance_explained": 0.9577777655009764, "cossim": 0.9738572807197111, "l2_ratio": 0.9730504180293486, "relative_reconstruction_bias": 0.9996947261942438, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7255357712148185, "loss_zero": 12.979128625019488, "frac_recovered": 0.9855547730463097, "frac_alive": 0.5794677734375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "trainer": {
+        "dict_class": "GatedAutoEncoder",
+        "trainer_class": "GatedSAETrainer",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "l1_penalty": 0.06,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "seed": 0,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 5.9910652264054995, "l1_loss": 62.50791938046375, "l0": 47.74804623155709, "frac_variance_explained": 0.9470674661268671, "cossim": 0.9670502421367599, "l2_ratio": 0.9660871851156994, "relative_reconstruction_bias": 1.0001960612205139, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7880731490721185, "loss_zero": 12.979128625019488, "frac_recovered": 0.9788411503814789, "frac_alive": 0.619537353515625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "trainer": {
+        "dict_class": "GatedAutoEncoder",
+        "trainer_class": "GatedSAETrainer",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "l1_penalty": 0.08,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "seed": 0,
+        "device": "cuda:1",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "GatedTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:1"
+    }
+}

GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 6.536454628749066, "l1_loss": 54.30702179598521, "l0": 30.933502128325312, "frac_variance_explained": 0.9372849065855325, "cossim": 0.9606502716799816, "l2_ratio": 0.9597143128693822, "relative_reconstruction_bias": 1.0001993872315051, "loss_original": 2.591329812285412, "loss_reconstructed": 2.858764309480966, "loss_zero": 12.979128625019488, "frac_recovered": 0.9712351309247764, "frac_alive": 0.49896240234375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "trainer_class": "JumpReluTrainer",
+        "dict_class": "JumpReluAutoEncoder",
+        "lr": 0.0003,
+        "steps": 244140,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
+        "submodule_name": "resid_post_layer_8",
+        "bandwidth": 0.001,
+        "sparsity_penalty": 1.0,
+        "sparsity_warmup_steps": 5000,
+        "target_l0": 20
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 7.139337657445885, "l1_loss": 59.039245306727395, "l0": 20.30936800715435, "frac_variance_explained": 0.925017165850444, "cossim": 0.9526806782527142, "l2_ratio": 0.9529376367488539, "relative_reconstruction_bias": 1.000274724270924, "loss_original": 2.591329812285412, "loss_reconstructed": 2.969967363110508, "loss_zero": 12.979128625019488, "frac_recovered": 0.959245752498328, "frac_alive": 0.2391815185546875, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "trainer_class": "JumpReluTrainer",
+        "dict_class": "JumpReluAutoEncoder",
+        "lr": 0.0003,
+        "steps": 244140,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
+        "submodule_name": "resid_post_layer_8",
+        "bandwidth": 0.001,
+        "sparsity_penalty": 1.0,
+        "sparsity_warmup_steps": 5000,
+        "target_l0": 40
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 6.315503855785692, "l1_loss": 69.46419074736446, "l0": 38.521164905594055, "frac_variance_explained": 0.9411711089582329, "cossim": 0.9631029457212931, "l2_ratio": 0.9633634108376791, "relative_reconstruction_bias": 1.000237949641354, "loss_original": 2.591329812285412, "loss_reconstructed": 2.8283018778605635, "loss_zero": 12.979128625019488, "frac_recovered": 0.9744724344058209, "frac_alive": 0.2889251708984375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "trainer_class": "JumpReluTrainer",
+        "dict_class": "JumpReluAutoEncoder",
+        "lr": 0.0003,
+        "steps": 244140,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
+        "submodule_name": "resid_post_layer_8",
+        "bandwidth": 0.001,
+        "sparsity_penalty": 1.0,
+        "sparsity_warmup_steps": 5000,
+        "target_l0": 80
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 5.575546784573291, "l1_loss": 87.10252729668674, "l0": 75.86958933451089, "frac_variance_explained": 0.9541435697710657, "cossim": 0.9713434024029467, "l2_ratio": 0.9709410197045429, "relative_reconstruction_bias": 1.000590334455651, "loss_original": 2.591329812285412, "loss_reconstructed": 2.7422697220940186, "loss_zero": 12.979128625019488, "frac_recovered": 0.983704393527594, "frac_alive": 0.25250244140625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "trainer_class": "JumpReluTrainer",
+        "dict_class": "JumpReluAutoEncoder",
+        "lr": 0.0003,
+        "steps": 244140,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
+        "submodule_name": "resid_post_layer_8",
+        "bandwidth": 0.001,
+        "sparsity_penalty": 1.0,
+        "sparsity_warmup_steps": 5000,
+        "target_l0": 160
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 4.9314600588327435, "l1_loss": 116.4271825307823, "l0": 133.42655972400343, "frac_variance_explained": 0.9642347908881773, "cossim": 0.9776697323982975, "l2_ratio": 0.9770763559513781, "relative_reconstruction_bias": 1.0007411794490124, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6929130008421747, "loss_zero": 12.979128625019488, "frac_recovered": 0.9890197440084204, "frac_alive": 0.180206298828125, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "trainer_class": "JumpReluTrainer",
+        "dict_class": "JumpReluAutoEncoder",
+        "lr": 0.0003,
+        "steps": 244140,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
+        "submodule_name": "resid_post_layer_8",
+        "bandwidth": 0.001,
+        "sparsity_penalty": 1.0,
+        "sparsity_warmup_steps": 5000,
+        "target_l0": 320
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 3.5989560319716674, "l1_loss": 205.8147625291204, "l0": 288.3598384627377, "frac_variance_explained": 0.9810939133167267, "cossim": 0.9881818513554262, "l2_ratio": 0.9883791021553867, "relative_reconstruction_bias": 0.9997902907520891, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6320626211453635, "loss_zero": 12.979128625019488, "frac_recovered": 0.9955570661878012, "frac_alive": 0.090667724609375, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "trainer_class": "JumpReluTrainer",
+        "dict_class": "JumpReluAutoEncoder",
+        "lr": 0.0003,
+        "steps": 244140,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "JumpReluTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
+        "submodule_name": "resid_post_layer_8",
+        "bandwidth": 0.001,
+        "sparsity_penalty": 1.0,
+        "sparsity_warmup_steps": 5000,
+        "target_l0": 640
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 3.2377584095460823, "l1_loss": 453.48774222868036, "l0": 601.1393484092621, "frac_variance_explained": 0.9818732688225895, "cossim": 0.9869504359113165, "l2_ratio": 1.0059552444032875, "relative_reconstruction_bias": 1.0114614389028893, "loss_original": 2.591329812285412, "loss_reconstructed": 2.6055084273039575, "loss_zero": 12.979128625019488, "frac_recovered": 0.9984711311667799, "frac_alive": 0.0184478759765625, "hyperparameters": {"n_inputs": 1000, "context_length": 1024}}

MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 3.3245140133482036, "l1_loss": 330.21591741388494, "l0": 318.3319563432173, "frac_variance_explained": 0.9841687444484595, "cossim": 0.9901527003808455, "l2_ratio": 0.9942391442530083, "relative_reconstruction_bias": 1.0023493802908696, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.638994989973126, "loss_zero": 12.187079458525687, "frac_recovered": 0.9963269107269518, "frac_alive": 0.1301422119140625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+    "trainer": {
+        "trainer_class": "MatryoshkaBatchTopKTrainer",
+        "dict_class": "MatryoshkaBatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 384,
+        "seed": 0,
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "group_fractions": [
+            0.03125,
+            0.0625,
+            0.125,
+            0.25,
+            0.53125
+        ],
+        "group_weights": [
+            0.2,
+            0.2,
+            0.2,
+            0.2,
+            0.2
+        ],
+        "group_sizes": [
+            2048,
+            4096,
+            8192,
+            16384,
+            34816
+        ],
+        "k": 640,
+        "device": "cuda:0",
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 1.3593918771454783, "l1_loss": 728.8317464192709, "l0": 639.0425174597538, "frac_variance_explained": 0.9971729137680747, "cossim": 0.9983109615065835, "l2_ratio": 0.9978185458616777, "relative_reconstruction_bias": 1.0002086866985669, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.61249912146366, "loss_zero": 12.187079458525687, "frac_recovered": 0.9993282484285759, "frac_alive": 0.014617919921875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "trainer": {
+        "trainer_class": "PAnnealTrainer",
+        "dict_class": "AutoEncoder",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "sparsity_function": "Lp^p",
+        "sparsity_penalty": 0.006,
+        "p_start": 1.0,
+        "p_end": 0.2,
+        "anneal_start": 10000,
+        "sparsity_queue_length": 10,
+        "n_sparsity_updates": 10,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "resample_steps": null,
+        "steps": 244140,
+        "seed": 0,
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_0",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 3.670030557748043, "l1_loss": 141.251840764826, "l0": 348.4003194173177, "frac_variance_explained": 0.9806513605695782, "cossim": 0.9877766822323655, "l2_ratio": 0.9837823004433603, "relative_reconstruction_bias": 0.9981156695972789, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.6619137381062363, "loss_zero": 12.187079458525687, "frac_recovered": 0.9938127037250635, "frac_alive": 0.3325347900390625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "trainer": {
+        "trainer_class": "PAnnealTrainer",
+        "dict_class": "AutoEncoder",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "sparsity_function": "Lp^p",
+        "sparsity_penalty": 0.008,
+        "p_start": 1.0,
+        "p_end": 0.2,
+        "anneal_start": 10000,
+        "sparsity_queue_length": 10,
+        "n_sparsity_updates": 10,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "resample_steps": null,
+        "steps": 244140,
+        "seed": 0,
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 4.522386738748262, "l1_loss": 105.70531741055575, "l0": 222.64559289180872, "frac_variance_explained": 0.9702411980339976, "cossim": 0.9813343160080187, "l2_ratio": 0.9774827162424723, "relative_reconstruction_bias": 0.9988945054285454, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.700730562210083, "loss_zero": 12.187079458525687, "frac_recovered": 0.9894037842750549, "frac_alive": 0.3546600341796875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "trainer": {
+        "trainer_class": "PAnnealTrainer",
+        "dict_class": "AutoEncoder",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "sparsity_function": "Lp^p",
+        "sparsity_penalty": 0.01,
+        "p_start": 1.0,
+        "p_end": 0.2,
+        "anneal_start": 10000,
+        "sparsity_queue_length": 10,
+        "n_sparsity_updates": 10,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "resample_steps": null,
+        "steps": 244140,
+        "seed": 0,
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 5.095679933374578, "l1_loss": 85.60162006724964, "l0": 147.9398868445194, "frac_variance_explained": 0.9617998256827845, "cossim": 0.9761915622335492, "l2_ratio": 0.9722904725508257, "relative_reconstruction_bias": 0.9986938003337744, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7340729850711245, "loss_zero": 12.187079458525687, "frac_recovered": 0.9857035152839891, "frac_alive": 0.364471435546875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "trainer": {
+        "trainer_class": "PAnnealTrainer",
+        "dict_class": "AutoEncoder",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "sparsity_function": "Lp^p",
+        "sparsity_penalty": 0.015,
+        "p_start": 1.0,
+        "p_end": 0.2,
+        "anneal_start": 10000,
+        "sparsity_queue_length": 10,
+        "n_sparsity_updates": 10,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "resample_steps": null,
+        "steps": 244140,
+        "seed": 0,
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 6.043845350092107, "l1_loss": 59.91714119188713, "l0": 64.48810577392578, "frac_variance_explained": 0.9458111127217611, "cossim": 0.9662979913480354, "l2_ratio": 0.9622128280726346, "relative_reconstruction_bias": 0.9980922232974659, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.8208962353793057, "loss_zero": 12.187079458525687, "frac_recovered": 0.976002908114231, "frac_alive": 0.368621826171875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "trainer": {
+        "trainer_class": "PAnnealTrainer",
+        "dict_class": "AutoEncoder",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "sparsity_function": "Lp^p",
+        "sparsity_penalty": 0.02,
+        "p_start": 1.0,
+        "p_end": 0.2,
+        "anneal_start": 10000,
+        "sparsity_queue_length": 10,
+        "n_sparsity_updates": 10,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "resample_steps": null,
+        "steps": 244140,
+        "seed": 0,
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 6.61760934193929, "l1_loss": 49.408933234937265, "l0": 36.88696647412849, "frac_variance_explained": 0.9357700709140662, "cossim": 0.9595213604695869, "l2_ratio": 0.9548207214384368, "relative_reconstruction_bias": 0.9978785514831543, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.903652989503109, "loss_zero": 12.187079458525687, "frac_recovered": 0.9667834169936903, "frac_alive": 0.3637542724609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "trainer": {
+        "trainer_class": "PAnnealTrainer",
+        "dict_class": "AutoEncoder",
+        "activation_dim": 768,
+        "dict_size": 65536,
+        "lr": 0.0003,
+        "sparsity_function": "Lp^p",
+        "sparsity_penalty": 0.025,
+        "p_start": 1.0,
+        "p_end": 0.2,
+        "anneal_start": 10000,
+        "sparsity_queue_length": 10,
+        "n_sparsity_updates": 10,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "decay_start": 195312,
+        "resample_steps": null,
+        "steps": 244140,
+        "seed": 0,
+        "layer": 8,
+        "lm_name": "EleutherAI/pythia-160m-deduped",
+        "wandb_name": "PAnnealTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
+        "submodule_name": "resid_post_layer_8"
+    },
+    "buffer": {
+        "d_submodule": 768,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 32,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}