from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One evaluation task: where it lives in the results JSON and how to label it."""

    benchmark: str  # task_key in the json file
    metric: str     # metric_key in the json file
    col_name: str   # name to display in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Text-only benchmarks shown on the leaderboard."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")


class TasksMultimodal(Enum):
    """Text benchmarks (mirrors `Tasks`) plus the multimodal benchmarks."""

    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
    task4 = Task("vqa", "acc", "VQA")
    task5 = Task("winoground", "acc", "Winoground")
    task6 = Task("devbench", "acc", "DevBench")


@dataclass
class TaskMIB_Subgraph:
    """MIB subgraph-track task: benchmark key, per-task model list, display name, metrics."""

    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (edge_counts, faithfulness)


class TasksMib_Subgraph(Enum):
    """Subgraph-track tasks; each task lists only the models evaluated on it."""

    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)


@dataclass
class TaskMIB_Causalgraph:
    """MIB causal-graph-track task: benchmark key, per-task model list, display name, metrics."""

    benchmark: str      # task name in json (ioi/arithmetic)
    models: list[str]   # list of models to show as sub-columns
    col_name: str       # display name in leaderboard
    metrics: list[str]  # metrics to store (average_score)


class TasksMib_Causalgraph(Enum):
    """Causal-graph-track tasks.

    NOTE: members were previously built with `TaskMIB_Subgraph` by mistake;
    they now use the `TaskMIB_Causalgraph` dataclass defined for this track.
    The field layout is identical, so attribute access is unchanged.
    """

    task0 = TaskMIB_Causalgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
    task1 = TaskMIB_Causalgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
    task2 = TaskMIB_Causalgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
    task3 = TaskMIB_Causalgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks"""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks"""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """