Commit 7086c8a · 1 Parent(s): 3a309c3
Aaron Mueller committed · formatting/filtering for causal variable
app.py CHANGED

@@ -38,7 +38,7 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leader
 from src.submission.submit import add_new_eval
 
 
-from src.about import TasksMib_Subgraph
+from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 
 # class SmartSelectColumns(SelectColumns):
 # """
@@ -389,21 +389,47 @@ def init_leaderboard_mib_causalgraph(dataframe, track)
     # print("Debugging column issues:")
     # print("\nActual DataFrame columns:")
     # print(dataframe.columns.tolist())
+
+    model_name_mapping = {
+        "Qwen2ForCausalLM": "Qwen-2.5",
+        "GPT2ForCausalLM": "GPT-2",
+        "Gemma2ForCausalLM": "Gemma-2",
+        "LlamaForCausalLM": "Llama-3.1"
+    }
+
+    benchmark_mapping = {
+        "IOI": "IOI",
+        "MCQA": "MCQA",
+        "arithmetic_addition": "Arithmetic (+)",
+        "arithmetic_subtraction": "Arithmetic (-)",
+        "arc_easy": "ARC (Easy)",
+        "arc_challenge": "ARC (Challenge)"
+    }
+
+    display_mapping = {}
+    for task in TasksMib_Causalgraph:
+        for model in task.value.models:
+            field_name = f"{task.value.col_name}_{model}"
+            display_name = f"{benchmark_mapping[task.value.col_name]} - {model_name_mapping[model]}"
+            display_mapping[field_name] = display_name
+
+    # print(dataframe)
+    renamed_df = dataframe.rename(columns=display_mapping)
 
     # Create only necessary columns
     return Leaderboard(
-        value=dataframe,
+        value=renamed_df,
         datatype=[c.type for c in fields(AutoEvalColumn_mib_causalgraph)],
-        select_columns=SelectColumns(
-            default_selection=["Method"],  # Start with just Method column
-            cant_deselect=["Method"],  # Method column should always be visible
-            label="Select Columns to Display:",
-        ),
+        # select_columns=SelectColumns(
+        #     default_selection=["Method"],  # Start with just Method column
+        #     cant_deselect=["Method"],  # Method column should always be visible
+        #     label="Select Columns to Display:",
+        # ),
         search_columns=["Method"],
-        hide_columns=[],
+        hide_columns=["eval_name"],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
-    )
+    ), renamed_df
 
 
 def init_leaderboard(dataframe, track):
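Two things worth noting about this hunk. First, the function now returns a (Leaderboard, DataFrame) tuple ("return Leaderboard(...), renamed_df"), which is why the call sites later in this commit unpack "leaderboard_*, data = init_leaderboard_mib_causalgraph(...)". Second, the rename only affects columns that follow the f"{task.value.col_name}_{model}" convention. A minimal standalone sketch of the same transform, using an invented sample DataFrame (the column names here are illustrative, not taken from the repo):

import pandas as pd

# Invented sample: raw leaderboard columns following the f"{col_name}_{model}" convention
df = pd.DataFrame(columns=["Method", "IOI_GPT2ForCausalLM", "MCQA_Qwen2ForCausalLM"])

display_mapping = {
    "IOI_GPT2ForCausalLM": "IOI - GPT-2",        # benchmark_mapping["IOI"] + model_name_mapping["GPT2ForCausalLM"]
    "MCQA_Qwen2ForCausalLM": "MCQA - Qwen-2.5",  # benchmark_mapping["MCQA"] + model_name_mapping["Qwen2ForCausalLM"]
}

renamed_df = df.rename(columns=display_mapping)
print(renamed_df.columns.tolist())  # ['Method', 'IOI - GPT-2', 'MCQA - Qwen-2.5']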
@@ -497,7 +523,7 @@ with demo:
 
             # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
             #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
-            with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
+            with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
                 with gr.Tabs() as subgraph_tabs:
                     with gr.TabItem("F+", id=0):
                         # Add description for filters
@@ -545,20 +571,36 @@ with demo:
             print(f"Leaderboard is {leaderboard}")
 
             # Then modify the Causal Graph tab section
-            with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
+            with gr.TabItem("Causal Variable Localization", elem_id="causalgraph", id=1):
                 with gr.Tabs() as causalgraph_tabs:
                     with gr.TabItem("Detailed View", id=0):
-                        leaderboard_detailed = init_leaderboard_mib_causalgraph(
+                        leaderboard_detailed, data = init_leaderboard_mib_causalgraph(
                             LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
                             "Causal Graph"
                         )
                     with gr.TabItem("Aggregated View", id=1):
-                        leaderboard_aggregated = init_leaderboard_mib_causalgraph(
+                        gr.Markdown("""
+                        ### Filtering Options
+                        Use the dropdown menus below to filter results by specific tasks or models.
+                        You can combine filters to see specific task-model combinations.
+                        """)
+                        substring_checkbox = gr.CheckboxGroup(
+                            choices=PRESET_SUBSTRINGS,
+                            label="Filter results:",
+                            value=PRESET_SUBSTRINGS,  # Default to all substrings selected
+                        )
+                        leaderboard_aggregated, data = init_leaderboard_mib_causalgraph(
                             LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED,
                             "Causal Graph"
                         )
+                        original_leaderboard = gr.State(value=data)
+                        substring_checkbox.change(
+                            fn=update_leaderboard,
+                            inputs=[original_leaderboard, substring_checkbox],
+                            outputs=leaderboard_aggregated
+                        )
                     with gr.TabItem("Intervention Averaged", id=2):
-                        leaderboard_averaged = init_leaderboard_mib_causalgraph(
+                        leaderboard_averaged, data = init_leaderboard_mib_causalgraph(
                             LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED,
                             "Causal Graph"
                         )
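update_leaderboard and PRESET_SUBSTRINGS are referenced here but defined elsewhere in app.py, outside this diff. The gr.State wrapper keeps the unfiltered DataFrame around, so each checkbox change filters from the original data rather than from an already-filtered view. A plausible sketch of such a handler, assuming it keeps every column whose display name contains at least one selected substring (the actual implementation in the repo may differ):

import pandas as pd

def update_leaderboard(original_df: pd.DataFrame, selected_substrings: list[str]) -> pd.DataFrame:
    # Keep the identifying "Method" column, plus any score column whose
    # header matches at least one of the selected substrings.
    keep = ["Method"] + [
        col for col in original_df.columns
        if col != "Method" and any(s in col for s in selected_substrings)
    ]
    return original_df[keep]

The signature mirrors the wiring above: inputs=[original_leaderboard, substring_checkbox] supplies the stored DataFrame and the checkbox selection, and the returned DataFrame updates leaderboard_aggregated.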
src/__pycache__/about.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/about.cpython-310.pyc and b/src/__pycache__/about.cpython-310.pyc differ

src/__pycache__/populate.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
src/about.py CHANGED

@@ -98,7 +98,10 @@ class TaskMIB_Causalgraph:
     metrics: list[str]  # metrics to store (average_score)
 
 class TasksMib_Causalgraph(Enum):
-    task0 = TaskMIB_Subgraph("
+    task0 = TaskMIB_Subgraph("ioi", ["GPT2ForCausalLM"], "IOI", ["average_score"])
+    task1 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
+    task2 = TaskMIB_Subgraph("arithmetic_addition", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arithmetic_addition", ["average_score"])
+    task3 = TaskMIB_Subgraph("arc_easy", ["Gemma2ForCausalLM", "LlamaForCausalLM"], "arc_easy", ["average_score"])
 
     @classmethod
     def get_all_tasks(cls):
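For context, these enum entries are what the renaming loop added to init_leaderboard_mib_causalgraph iterates over: task.value.col_name supplies the benchmark key and task.value.models the model list. A quick illustration of the raw field names that loop would generate, assuming the positional fields of TaskMIB_Subgraph are benchmark, models, col_name, metrics (as the usage in app.py suggests):

from src.about import TasksMib_Causalgraph

for task in TasksMib_Causalgraph:
    for model in task.value.models:
        print(f"{task.value.col_name}_{model}")
# IOI_GPT2ForCausalLM
# MCQA_Qwen2ForCausalLM
# MCQA_Gemma2ForCausalLM
# MCQA_LlamaForCausalLM
# arithmetic_addition_Gemma2ForCausalLM
# arithmetic_addition_LlamaForCausalLM
# arc_easy_Gemma2ForCausalLM
# arc_easy_LlamaForCausalLM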
src/display/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ

src/leaderboard/__pycache__/read_evals.cpython-310.pyc CHANGED
Binary files a/src/leaderboard/__pycache__/read_evals.cpython-310.pyc and b/src/leaderboard/__pycache__/read_evals.cpython-310.pyc differ