jasonshaoshun committed · commit 4780a48 · 1 parent: 1d8e193
Commit message: caulsal-track debug

Files changed:
- app.py: +1 -277
- caulsal_metric.py: +5 -0
- src/about.py: +38 -70
- src/display/utils.py: +23 -200
- src/leaderboard/read_evals.py: +141 -250
- src/populate.py: +21 -140
app.py CHANGED

@@ -23,10 +23,8 @@ from src.display.utils import (
     BENCHMARK_COLS,
     BENCHMARK_COLS_MULTIMODAL,
     BENCHMARK_COLS_MIB_SUBGRAPH,
-    BENCHMARK_COLS_MIB_CAUSALGRAPH,
     COLS,
     COLS_MIB_SUBGRAPH,
-    COLS_MIB_CAUSALGRAPH,
     COLS_MULTIMODAL,
     EVAL_COLS,
     EVAL_TYPES,

@@ -281,9 +279,7 @@ LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_M
 # In app.py, modify the LEADERBOARD initialization
 LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED = get_leaderboard_df_mib_causalgraph(
     EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
-    EVAL_REQUESTS_PATH
-    COLS_MIB_CAUSALGRAPH,
-    BENCHMARK_COLS_MIB_CAUSALGRAPH
+    EVAL_REQUESTS_PATH
 )

@@ -300,95 +296,6 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
-    [~90 lines of commented-out code removed: two earlier drafts of init_leaderboard_mib_subgraph —
-     one built on SelectColumns using the displayed_by_default / cant_deselect / hidden fields of
-     AutoEvalColumn_mib_subgraph, the other built benchmark groups and model groups of
-     f"{benchmark}_{model}" columns from TasksMib_Subgraph and passed the flattened list as the
-     default_selection of the Leaderboard.]
 def init_leaderboard_mib_subgraph(dataframe, track):
     """Initialize the subgraph leaderboard with display names for better readability."""
     if dataframe is None or dataframe.empty:

@@ -478,189 +385,6 @@ def init_leaderboard_mib_subgraph(dataframe, track):
-    [~180 lines of commented-out code removed: two experiments that drove column selection through a
-     SmartSelectColumns helper (benchmark keywords "ioi", "mcqa", "arithmetic_addition",
-     "arithmetic_subtraction", "arc_easy", "arc_challenge"; model keywords "qwen2_5", "gpt2",
-     "gemma2", "llama3"; an optional display-name mapping; initial_selected=["Method", "Average"]),
-     a variant that built "Benchmark: ..." / "Model: ..." selection groups from TasksMib_Subgraph,
-     and an earlier commented-out init_leaderboard_mib_causalgraph based on the fields of
-     AutoEvalColumn_mib_causalgraph.]
 def init_leaderboard_mib_causalgraph(dataframe, track):
     # print("Debugging column issues:")
     # print("\nActual DataFrame columns:")
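After this change the causal-graph call site in app.py passes only the two paths and still unpacks three frames (detailed, aggregated, averaged). The sketch below is a minimal stand-in, not the repo's populate code: the helper body and the path constants are placeholders, only the argument list and the three-way unpack mirror the diff.

# Hypothetical sketch of the simplified call shape; the helper here is a stub.
import pandas as pd

def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str):
    """Stand-in with the post-commit signature: two path arguments in, three DataFrames out."""
    detailed = pd.DataFrame({"Method": ["ABC"], "Qwen2ForCausalLM_MCQA_output_token": [0.75]})
    aggregated = detailed.copy()   # duplicate submissions collapsed (max per column)
    averaged = detailed.copy()     # intervention columns averaged per model_task
    return detailed, aggregated, averaged

EVAL_RESULTS_MIB_CAUSALGRAPH_PATH = "eval-results-mib-causalgraph"   # placeholder path
EVAL_REQUESTS_PATH = "eval-queue"                                    # placeholder path

(LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED,
 LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGATED,
 LEADERBOARD_DF_MIB_CAUSALGRAPH_AVERAGED) = get_leaderboard_df_mib_causalgraph(
    EVAL_RESULTS_MIB_CAUSALGRAPH_PATH,
    EVAL_REQUESTS_PATH,
)
print(LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED.head())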
caulsal_metric.py CHANGED

@@ -135,6 +135,11 @@ def create_summary_dataframe(json_files: List[Dict[str, Any]]) -> pd.DataFrame:
 
     return df
 
+
+# averaged_cf = average_counterfactuals(json_files)
+# layer_averaged = find_layer_averages(averaged_cf)
+# detailed_df = create_summary_dataframe(layer_averaged)
+
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
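The only change here adds a commented-out reminder of the intended pipeline above aggregate_methods. The three helper names come from this file (create_summary_dataframe's signature is visible in the hunk header), but their bodies are not part of the diff, so the stubs below are assumptions used only to show how the chain would be wired.

# Sketch with stubbed helpers; intermediate shapes are assumptions, not the repo's implementations.
from typing import Any, Dict, List
import pandas as pd

def average_counterfactuals(json_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Assumed: collapse each intervention's counterfactual scores into one mean score.
    return json_files

def find_layer_averages(averaged_cf: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Assumed: average the per-layer scores for each (model, task, intervention).
    return averaged_cf

def create_summary_dataframe(json_files: List[Dict[str, Any]]) -> pd.DataFrame:
    # Signature from the diff; assumed to yield one row per method, one column per MODEL_TASK_INTERVENTION.
    return pd.DataFrame(json_files)

json_files: List[Dict[str, Any]] = []          # would come from the scanned result files
averaged_cf = average_counterfactuals(json_files)
layer_averaged = find_layer_averages(averaged_cf)
detailed_df = create_summary_dataframe(layer_averaged)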
src/about.py CHANGED

@@ -61,33 +61,27 @@ class TasksMib_Subgraph(Enum):
         return sorted(list(models))
 
 
 # @dataclass
 # class TaskMIB_Causalgraph:
 #     benchmark: str
 #     models: list[str]
-#     layers: list[str]
+#     layers: dict[str, list[str]]  # Different layers for each model
 #     col_name: str
 #     interventions: list[str]
 #     counterfactuals: list[str]
 #     metrics: list[str]
 
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "MCQA",
-#         ["LlamaForCausalLM", "Qwen2ForCausalLM", "Gemma2ForCausalLM"],  # Updated model list
-#         [str(i) for i in range(32)],  # 0-31 layers
-#         "mcqa",
-#         ["output_token", "output_location"],
-#         ["symbol_counterfactual", "randomLetter_counterfactual",
-#          "answerPosition_counterfactual", "answerPosition_symbol_counterfactual"],
-#         ["score"]
-#     )
 
 # class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "    [three old argument lines are garbled in the page capture]
+#     task0 = TaskMIB_Causalgraph("MCQA",
+#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],
+#         {
+#             "qwen2forcausallm": [str(i) for i in range(24)],   # 0-23
+#             "gemma2forcausallm": [str(i) for i in range(26)],  # 0-25
+#             "llamaforcausallm": [str(i) for i in range(32)]    # 0-31
+#         },
 #         "mcqa",
 #         ["output_token", "output_location"],
 #         ["randomLetter_counterfactual", "answerPosition_counterfactual",

@@ -95,58 +89,32 @@ class TasksMib_Subgraph(Enum):
 #         ["score"]
 #     )
 
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "MCQA",
-#         ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"],  # Use lowercase consistently
-#         [str(i) for i in range(32)],
-#         "mcqa",
-#         ["output_token", "output_location"],
-#         ["randomLetter_counterfactual", "answerPosition_counterfactual",
-#          "answerPosition_randomLetter_counterfactual"],
-#         ["score"]
-#     )
 
 @dataclass
 class TaskMIB_Causalgraph:
-    benchmark: str
-    models: list[str]
-    # [two further field lines are garbled in the page capture]
-    interventions: list[str]
-    counterfactuals: list[str]
-    metrics: list[str]
+    benchmark: str        # task name in json (ioi/arithmetic)
+    models: list[str]     # list of models to show as sub-columns
+    col_name: str         # display name in leaderboard
+    metrics: list[str]    # metrics to store (average_score)
 
-# class TasksMib_Causalgraph(Enum):
-#     task0 = TaskMIB_Causalgraph(
-#         "MCQA",
-#         ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],
-#         {
-#             "Qwen2ForCausalLM": [str(i) for i in range(24)],   # 0-23
-#             "Gemma2ForCausalLM": [str(i) for i in range(26)],  # 0-25
-#             "LlamaForCausalLM": [str(i) for i in range(32)]    # 0-31
-#         },
-#         "mcqa",
-#         ["output_token", "output_location"],
-#         ["randomLetter_counterfactual", "answerPosition_counterfactual",
-#          "answerPosition_randomLetter_counterfactual"],
-#         ["score"]
-#     )
 class TasksMib_Causalgraph(Enum):
-    task0 =     [the remainder of the old enum body is garbled in the page capture]
+    task0 = TaskMIB_Subgraph("mcqa", ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"], "MCQA", ["average_score"])
+
+    @classmethod
+    def get_all_tasks(cls):
+        """Returns a list of all task benchmarks"""
+        return [task.value.benchmark for task in cls]
+
+    @classmethod
+    def get_all_models(cls):
+        """Returns a list of all unique models across all tasks"""
+        models = set()
+        for task in cls:
+            models.update(task.value.models)
+        return sorted(list(models))
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
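Note that the new enum entry is constructed with TaskMIB_Subgraph even though the commit defines a four-field TaskMIB_Causalgraph directly above it; that only works if the subgraph dataclass happens to take the same four positional fields, so it reads like a leftover worth flagging. Below is a self-contained sketch of the intended pattern using the causal-graph dataclass from the diff; the printed results are what the two classmethods return for this single entry.

# Sketch of the new task registry; runnable on its own (Python 3.9+ for list[str] annotations).
from dataclasses import dataclass
from enum import Enum

@dataclass
class TaskMIB_Causalgraph:
    benchmark: str        # task name in json (ioi/arithmetic)
    models: list[str]     # list of models to show as sub-columns
    col_name: str         # display name in leaderboard
    metrics: list[str]    # metrics to store (average_score)

class TasksMib_Causalgraph(Enum):
    # The commit uses TaskMIB_Subgraph here; using the causal-graph dataclass is this sketch's choice.
    task0 = TaskMIB_Causalgraph(
        "mcqa",
        ["Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"],
        "MCQA",
        ["average_score"],
    )

    @classmethod
    def get_all_tasks(cls):
        """Returns a list of all task benchmarks."""
        return [task.value.benchmark for task in cls]

    @classmethod
    def get_all_models(cls):
        """Returns a list of all unique models across all tasks."""
        models = set()
        for task in cls:
            models.update(task.value.models)
        return sorted(models)

print(TasksMib_Causalgraph.get_all_tasks())    # ['mcqa']
print(TasksMib_Causalgraph.get_all_models())   # ['Gemma2ForCausalLM', 'LlamaForCausalLM', 'Qwen2ForCausalLM']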
src/display/utils.py CHANGED

@@ -58,64 +58,6 @@ AutoEvalColumnMultimodal = make_dataclass("AutoEvalColumnMultimodal", auto_eval_
 
 
 
-    [~55 lines of commented-out code removed: the "Version 1" and "Version 2" drafts that built
-     auto_eval_column_dict_mib_subgraph from a Method column, optional Task/Model filter columns,
-     one number column per f"{task.value.benchmark}_{model}" combination, and an Average column.]
 
 ##############################################################################################################
 # Version 3

@@ -185,133 +127,6 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 
 
 
-    [~125 lines of commented-out code removed: earlier drafts of auto_eval_column_dict_mib_causalgraph
-     that generated one column per model/layer/intervention/counterfactual combination
-     (f"{model}_layer{layer}_{intervention}_{counterfactual}"), in several lowercase / exact-casing
-     variants, some adding an extra eval_name column.]
 auto_eval_column_dict_mib_causalgraph = []
 
 # Only include Method column as required

@@ -320,15 +135,17 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon
 # For each model-task-intervention-counterfactual combination
 for task in TasksMib_Causalgraph:
     for model in task.value.models:  # model will be lowercase
-        [the old loop body is garbled in the page capture]
+        col_name = f"{task.value.benchmark}_{model}"
+        auto_eval_column_dict_mib_causalgraph.append([
+            col_name,
+            ColumnContent,
+            ColumnContent(col_name, "number", True)
+        ])
+
+# Add the Average column
+auto_eval_column_dict_mib_causalgraph.append(
+    ["average_score", ColumnContent, ColumnContent("Average", "number", True)]
+)
 
 # Create the dataclass
 AutoEvalColumn_mib_causalgraph = make_dataclass(

@@ -337,14 +154,20 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
     frozen=True
 )
 
-# Column selection for display
-COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
 
 
+# # Column selection for display
+# COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
+
+# BENCHMARK_COLS_MIB_CAUSALGRAPH = [f"{model}_{task.value.benchmark}_{intervention}".lower()
+#                                   for task in TasksMib_Causalgraph
+#                                   for model in task.value.models
+#                                   for intervention in task.value.interventions]
 
 
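What survives in utils.py is the "Version 3" machinery: a list of [attribute, type, ColumnContent(...)] triples, one per f"{benchmark}_{model}" column plus Method and Average, handed to dataclasses.make_dataclass. The sketch below reproduces that pattern so it runs standalone; ColumnContent and the fields() helper are stand-ins modelled on the usual leaderboard template, since their real definitions sit outside this diff and may differ.

# Stand-ins marked as assumptions: ColumnContent's field list and the fields() helper.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Template-style helper: returns the ColumnContent defaults stored on the class,
    # which is why attributes like .name and .hidden are available below.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

auto_eval_column_dict_mib_causalgraph = [
    ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)],
]
for col_name in ["mcqa_Qwen2ForCausalLM", "mcqa_Gemma2ForCausalLM", "mcqa_LlamaForCausalLM"]:
    auto_eval_column_dict_mib_causalgraph.append(
        [col_name, ColumnContent, ColumnContent(col_name, "number", True)]
    )
auto_eval_column_dict_mib_causalgraph.append(
    ["average_score", ColumnContent, ColumnContent("Average", "number", True)]
)

# Each [attribute_name, annotation, default] triple is exactly what make_dataclass expects.
AutoEvalColumn_mib_causalgraph = make_dataclass(
    "AutoEvalColumn_mib_causalgraph", auto_eval_column_dict_mib_causalgraph, frozen=True
)

COLS_MIB_CAUSALGRAPH = [c.name for c in fields(AutoEvalColumn_mib_causalgraph) if not c.hidden]
print(COLS_MIB_CAUSALGRAPH)
# e.g. ['Method', 'mcqa_Qwen2ForCausalLM', 'mcqa_Gemma2ForCausalLM', 'mcqa_LlamaForCausalLM', 'Average']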
src/leaderboard/read_evals.py
CHANGED
|
@@ -12,8 +12,10 @@ from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, Tasks, T
|
|
| 12 |
from src.submission.check_validity import is_model_on_hub
|
| 13 |
from src.about import TasksMib_Subgraph
|
| 14 |
|
| 15 |
-
from typing import List, Dict
|
| 16 |
from collections import defaultdict
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def compute_area(edge_counts, faithfulnesses, log_scale=True):
|
|
@@ -65,21 +67,8 @@ class EvalResult_MIB_SUBGRAPH:
|
|
| 65 |
for model_result in data.get("results", []):
|
| 66 |
model_id = model_result.get("model_id", "")
|
| 67 |
|
| 68 |
-
# if "/" in model_id:
|
| 69 |
-
# org = model_id.split("/")[0]
|
| 70 |
-
# if org == "meta-llama":
|
| 71 |
-
# model_name = "llama3"
|
| 72 |
-
# elif org == "Qwen":
|
| 73 |
-
# model_name = "qwen2_5"
|
| 74 |
-
# elif "gpt" in model_id.lower():
|
| 75 |
-
# model_name = "gpt2"
|
| 76 |
-
# elif org == "google":
|
| 77 |
-
# model_name = "gemma2"
|
| 78 |
-
# else:
|
| 79 |
-
# model_name = model_id.replace(".", "_")
|
| 80 |
model_name = model_id.replace(".", "_")
|
| 81 |
|
| 82 |
-
|
| 83 |
# Keep exact scores structure from JSON
|
| 84 |
scores = model_result.get("scores", {})
|
| 85 |
|
|
@@ -108,16 +97,7 @@ class EvalResult_MIB_SUBGRAPH:
|
|
| 108 |
|
| 109 |
# Initialize all possible columns with '-'
|
| 110 |
expected_models = TasksMib_Subgraph.get_all_models()
|
| 111 |
-
expected_tasks = TasksMib_Subgraph.get_all_tasks()
|
| 112 |
-
# for task in expected_tasks:
|
| 113 |
-
# for model in task.value.models:
|
| 114 |
-
# # if model == "gpt2" and task != "ioi":
|
| 115 |
-
# # continue
|
| 116 |
-
# # if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
|
| 117 |
-
# # continue
|
| 118 |
-
# # if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
|
| 119 |
-
# # continue
|
| 120 |
-
# data_dict[f"{task}_{model}"] = '-'
|
| 121 |
|
| 122 |
for task in TasksMib_Subgraph:
|
| 123 |
for model in task.value.models:
|
|
@@ -145,23 +125,6 @@ class EvalResult_MIB_SUBGRAPH:
|
|
| 145 |
data_dict[col_name] = round(score, 2)
|
| 146 |
all_scores.append(score)
|
| 147 |
|
| 148 |
-
# All entries must be present for average
|
| 149 |
-
# required_entries = [
|
| 150 |
-
# data_dict['ioi_llama3'] != '-',
|
| 151 |
-
# data_dict['ioi_qwen2_5'] != '-',
|
| 152 |
-
# data_dict['ioi_gpt2'] != '-',
|
| 153 |
-
# data_dict['ioi_gemma2'] != '-',
|
| 154 |
-
# data_dict['mcqa_llama3'] != '-',
|
| 155 |
-
# data_dict['mcqa_qwen2_5'] != '-',
|
| 156 |
-
# data_dict['mcqa_gemma2'] != '-',
|
| 157 |
-
# data_dict['arithmetic_addition_llama3'] != '-',
|
| 158 |
-
# data_dict['arithmetic_subtraction_llama3'] != '-',
|
| 159 |
-
# data_dict['arc_easy_gemma2'] != '-',
|
| 160 |
-
# data_dict['arc_easy_llama3'] != '-',
|
| 161 |
-
# data_dict['arc_challenge_llama3'] != '-'
|
| 162 |
-
# ]
|
| 163 |
-
|
| 164 |
-
# data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
|
| 165 |
data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
|
| 166 |
return data_dict
|
| 167 |
|
|
@@ -207,9 +170,63 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
|
|
| 207 |
|
| 208 |
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# @dataclass
|
| 211 |
# class EvalResult_MIB_CAUSALGRAPH:
|
| 212 |
-
# """Represents one full evaluation for a method in MIB causalgraph."""
|
| 213 |
# eval_name: str
|
| 214 |
# method_name: str
|
| 215 |
# results: Dict
|
|
@@ -222,31 +239,26 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
|
|
| 222 |
# method_name = data.get("method_name")
|
| 223 |
# results = {}
|
| 224 |
|
| 225 |
-
# #
|
| 226 |
# for model_result in data.get("results", []):
|
| 227 |
-
# model_id = model_result.get("model_id", "")
|
|
|
|
|
|
|
|
|
|
| 228 |
# task_scores = model_result.get("task_scores", {})
|
| 229 |
|
| 230 |
-
# # Process MCQA
|
| 231 |
-
# mcqa_scores = {}
|
| 232 |
# for layer_data in task_scores.get("MCQA", []):
|
| 233 |
# layer = layer_data.get("layer")
|
| 234 |
-
#
|
| 235 |
-
|
| 236 |
-
#
|
| 237 |
-
# for intervention_data in layer_scores:
|
| 238 |
-
# intervention = intervention_data["intervention"][0]
|
| 239 |
-
# counterfactual_scores = intervention_data["counterfactual_scores"]
|
| 240 |
-
|
| 241 |
-
# for cf_score in counterfactual_scores:
|
| 242 |
# counterfactual = cf_score["counterfactual"][0]
|
| 243 |
# score = cf_score["score"]
|
| 244 |
|
| 245 |
-
# # Create key
|
| 246 |
-
# key = f"
|
| 247 |
-
#
|
| 248 |
-
|
| 249 |
-
# results[model_id] = mcqa_scores
|
| 250 |
|
| 251 |
# return EvalResult_MIB_CAUSALGRAPH(
|
| 252 |
# eval_name=method_name,
|
|
@@ -254,198 +266,70 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
|
|
| 254 |
# results=results
|
| 255 |
# )
|
| 256 |
|
| 257 |
-
# def to_dict(self):
|
| 258 |
-
# """Converts the Eval Result to a dict for dataframe display"""
|
| 259 |
-
# data_dict = {
|
| 260 |
-
# "eval_name": self.eval_name,
|
| 261 |
-
# "Method": self.method_name,
|
| 262 |
-
# }
|
| 263 |
-
|
| 264 |
-
# # Process each model's results
|
| 265 |
-
# for model_id, model_results in self.results.items():
|
| 266 |
-
# for task, task_scores in model_results.items():
|
| 267 |
-
# # Calculate layer-averaged scores for each intervention
|
| 268 |
-
# intervention_scores = defaultdict(list)
|
| 269 |
-
|
| 270 |
-
# for layer_data in task_scores:
|
| 271 |
-
# for score_data in layer_data['scores']:
|
| 272 |
-
# intervention = score_data['intervention']
|
| 273 |
-
# intervention_scores[intervention].append(score_data['score'])
|
| 274 |
-
|
| 275 |
-
# # Average across layers for each intervention
|
| 276 |
-
# for intervention, scores in intervention_scores.items():
|
| 277 |
-
# col_name = f"{model_id}_{task}_{intervention}".lower()
|
| 278 |
-
# data_dict[col_name] = round(np.mean(scores), 3)
|
| 279 |
-
|
| 280 |
-
# return data_dict
|
| 281 |
-
|
| 282 |
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
|
| 291 |
-
# for model_id, task_scores in self.results.items():
|
| 292 |
-
# model_name = model_id.lower() # Lowercase for consistency
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
# layer = layer_data.get("layer")
|
| 297 |
-
# layer_scores = layer_data.get("layer_scores", [])
|
| 298 |
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
# counterfactual_scores = intervention_data["counterfactual_scores"]
|
| 303 |
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
# data_dict = {
|
| 319 |
-
# "eval_name": self.eval_name,
|
| 320 |
-
# "Method": self.method_name,
|
| 321 |
-
# }
|
| 322 |
-
|
| 323 |
-
# # Process each model's results
|
| 324 |
-
# for model_id, scores in self.results.items():
|
| 325 |
-
# model_name = model_id.lower()
|
| 326 |
-
# for task, layer_scores in scores.items():
|
| 327 |
-
# for layer_data in layer_scores:
|
| 328 |
-
# layer = layer_data.get("layer")
|
| 329 |
-
# intervention_scores = layer_data.get("scores", [])
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
# def to_dict(self):
|
| 339 |
-
# """Converts the Eval Result to a dict for dataframe display"""
|
| 340 |
-
# data_dict = {
|
| 341 |
-
# "eval_name": self.eval_name,
|
| 342 |
-
# "Method": self.method_name,
|
| 343 |
-
# }
|
| 344 |
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
# col_name = f"{model_name}_{key}"
|
| 351 |
-
# data_dict[col_name] = value
|
| 352 |
-
|
| 353 |
-
# return data_dict
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
@dataclass
|
| 363 |
-
class EvalResult_MIB_CAUSALGRAPH:
|
| 364 |
-
eval_name: str
|
| 365 |
-
method_name: str
|
| 366 |
-
results: Dict
|
| 367 |
-
|
| 368 |
-
def init_from_json_file(self, json_filepath):
|
| 369 |
-
"""Inits results from the method result file"""
|
| 370 |
-
with open(json_filepath) as fp:
|
| 371 |
-
data = json.load(fp)
|
| 372 |
-
|
| 373 |
-
method_name = data.get("method_name")
|
| 374 |
-
results = {}
|
| 375 |
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
for score_data in layer_data.get("layer_scores", []):
|
| 385 |
-
intervention = score_data["intervention"][0]
|
| 386 |
-
for cf_score in score_data["counterfactual_scores"]:
|
| 387 |
-
counterfactual = cf_score["counterfactual"][0]
|
| 388 |
-
score = cf_score["score"]
|
| 389 |
-
|
| 390 |
-
# Create key matching the expected column format
|
| 391 |
-
key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
|
| 392 |
-
results[key] = score
|
| 393 |
-
|
| 394 |
-
return EvalResult_MIB_CAUSALGRAPH(
|
| 395 |
-
eval_name=method_name,
|
| 396 |
-
method_name=method_name,
|
| 397 |
-
results=results
|
| 398 |
-
)
|
| 399 |
-
|
| 400 |
-
def to_dict(self):
|
| 401 |
-
"""Converts the Eval Result to a dict for dataframe display"""
|
| 402 |
-
data_dict = {
|
| 403 |
-
"eval_name": self.eval_name,
|
| 404 |
-
"Method": self.method_name,
|
| 405 |
-
}
|
| 406 |
|
| 407 |
-
|
| 408 |
-
|
| 409 |
|
| 410 |
-
|
| 411 |
-
|
| 412 |
|
| 413 |
|
| 414 |
|
| 415 |
|
| 416 |
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
# def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
| 421 |
-
# """Extract evaluation results for MIB causalgraph"""
|
| 422 |
-
# model_result_filepaths = []
|
| 423 |
-
|
| 424 |
-
# for root, dirnames, files in os.walk(results_path):
|
| 425 |
-
# if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
| 426 |
-
# continue
|
| 427 |
-
|
| 428 |
-
# try:
|
| 429 |
-
# files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
| 430 |
-
# except dateutil.parser._parser.ParserError:
|
| 431 |
-
# files = [files[-1]]
|
| 432 |
-
|
| 433 |
-
# for file in files:
|
| 434 |
-
# model_result_filepaths.append(os.path.join(root, file))
|
| 435 |
-
|
| 436 |
-
# eval_results = []
|
| 437 |
-
# for filepath in model_result_filepaths:
|
| 438 |
-
# try:
|
| 439 |
-
# eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
|
| 440 |
-
# result = eval_result.init_from_json_file(filepath)
|
| 441 |
-
# result.to_dict() # Verify conversion works
|
| 442 |
-
# eval_results.append(result)
|
| 443 |
-
# except Exception as e:
|
| 444 |
-
# print(f"Error processing {filepath}: {e}")
|
| 445 |
-
# continue
|
| 446 |
-
|
| 447 |
-
# return eval_results
|
| 448 |
-
|
| 449 |
def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
|
| 450 |
model_result_filepaths = []
|
| 451 |
|
|
@@ -466,23 +350,30 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
|
|
| 466 |
|
| 467 |
# print(f"Found json files: {model_result_filepaths}")
|
| 468 |
|
| 469 |
-
|
| 470 |
-
|
|
|
|
|
|
|
| 471 |
try:
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
|
|
|
|
|
|
|
|
|
| 477 |
except Exception as e:
|
| 478 |
-
print(f"Error processing {
|
| 479 |
continue
|
| 480 |
-
|
| 481 |
-
#
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
| 486 |
|
| 487 |
|
| 488 |
|
|
|
|
| 12 |
from src.submission.check_validity import is_model_on_hub
|
| 13 |
from src.about import TasksMib_Subgraph
|
| 14 |
|
| 15 |
+
from typing import List, Dict, Any
|
| 16 |
from collections import defaultdict
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
|
| 20 |
|
| 21 |
def compute_area(edge_counts, faithfulnesses, log_scale=True):
|
|
|
|
| 67 |
for model_result in data.get("results", []):
|
| 68 |
model_id = model_result.get("model_id", "")
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
model_name = model_id.replace(".", "_")
|
| 71 |
|
|
|
|
| 72 |
# Keep exact scores structure from JSON
|
| 73 |
scores = model_result.get("scores", {})
|
| 74 |
|
|
|
|
| 97 |
|
| 98 |
# Initialize all possible columns with '-'
|
| 99 |
expected_models = TasksMib_Subgraph.get_all_models()
|
| 100 |
+
# expected_tasks = TasksMib_Subgraph.get_all_tasks()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
for task in TasksMib_Subgraph:
|
| 103 |
for model in task.value.models:
|
|
|
|
| 125 |
data_dict[col_name] = round(score, 2)
|
| 126 |
all_scores.append(score)
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
|
| 129 |
return data_dict
|
| 130 |
|
|
|
|
| 170 |
|
| 171 |
|
| 172 |
|
| 173 |
+
|
| 174 |
+
def process_single_json(json_file: Dict[str, Any], method_counter: int) -> pd.DataFrame:
|
| 175 |
+
"""
|
| 176 |
+
Process a single JSON file and convert it to a DataFrame.
|
| 177 |
+
|
| 178 |
+
Args:
|
| 179 |
+
json_file: Dictionary containing the analysis results
|
| 180 |
+
method_counter: Counter for handling duplicate method names
|
| 181 |
+
|
| 182 |
+
Returns:
|
| 183 |
+
pd.DataFrame: DataFrame for single method with MODEL_TASK_INTERVENTION as columns
|
| 184 |
+
"""
|
| 185 |
+
method_name = json_file['method_name']
|
| 186 |
+
unique_method_name = f"{method_name}_{method_counter}"
|
| 187 |
+
method_scores = []
|
| 188 |
+
|
| 189 |
+
for result in json_file['results']:
|
| 190 |
+
model = result['model_id']
|
| 191 |
+
|
| 192 |
+
for task, scores in result['task_scores'].items():
|
| 193 |
+
# Process each layer's data
|
| 194 |
+
intervention_scores = defaultdict(list)
|
| 195 |
+
|
| 196 |
+
for layer_data in scores:
|
| 197 |
+
for intervention_data in layer_data['layer_scores']:
|
| 198 |
+
# Calculate average score for counterfactuals
|
| 199 |
+
avg_cf_score = np.mean([
|
| 200 |
+
cf['score']
|
| 201 |
+
for cf in intervention_data['counterfactual_scores']
|
| 202 |
+
])
|
| 203 |
+
|
| 204 |
+
if np.isnan(avg_cf_score):
|
| 205 |
+
avg_cf_score = 0.0
|
| 206 |
+
|
| 207 |
+
# Group scores by intervention
|
| 208 |
+
intervention_key = '_'.join(intervention_data['intervention'])
|
| 209 |
+
intervention_scores[intervention_key].append(avg_cf_score)
|
| 210 |
+
|
| 211 |
+
# Average across layers for each intervention
|
| 212 |
+
for intervention, layer_scores in intervention_scores.items():
|
| 213 |
+
column = f"{model}_{task}_{intervention}"
|
| 214 |
+
avg_score = np.mean(layer_scores) if layer_scores else 0.0
|
| 215 |
+
method_scores.append((column, f"{avg_score:.3f}"))
|
| 216 |
+
|
| 217 |
+
# Sort by column names for consistency
|
| 218 |
+
method_scores.sort(key=lambda x: x[0])
|
| 219 |
+
data = {
|
| 220 |
+
unique_method_name: {
|
| 221 |
+
col: score for col, score in method_scores
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
return pd.DataFrame.from_dict(data, orient='index')
|
| 226 |
+
|
| 227 |
+
|
| 228 |
# @dataclass
|
| 229 |
# class EvalResult_MIB_CAUSALGRAPH:
|
|
|
|
| 230 |
# eval_name: str
|
| 231 |
# method_name: str
|
| 232 |
# results: Dict
|
|
|
|
| 239 |
# method_name = data.get("method_name")
|
| 240 |
# results = {}
|
| 241 |
|
| 242 |
+
# # Process each model's results
|
| 243 |
# for model_result in data.get("results", []):
|
| 244 |
+
# model_id = model_result.get("model_id", "")
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
|
| 248 |
# task_scores = model_result.get("task_scores", {})
|
| 249 |
|
| 250 |
+
# # Process MCQA scores
|
|
|
|
| 251 |
# for layer_data in task_scores.get("MCQA", []):
|
| 252 |
# layer = layer_data.get("layer")
|
| 253 |
+
# for score_data in layer_data.get("layer_scores", []):
|
| 254 |
+
# intervention = score_data["intervention"][0]
|
| 255 |
+
# for cf_score in score_data["counterfactual_scores"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
# counterfactual = cf_score["counterfactual"][0]
|
| 257 |
# score = cf_score["score"]
|
| 258 |
|
| 259 |
+
# # Create key matching the expected column format
|
| 260 |
+
# key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
|
| 261 |
+
# results[key] = score
|
|
|
|
|
|
|
| 262 |
|
| 263 |
# return EvalResult_MIB_CAUSALGRAPH(
|
| 264 |
# eval_name=method_name,
|
|
|
|
| 266 |
# results=results
|
| 267 |
# )
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
+
# data = {}
|
| 271 |
+
# method_counters = defaultdict(int)
|
| 272 |
+
|
| 273 |
+
# for json_file in json_files:
|
| 274 |
+
# # Handle method name and duplicates
|
| 275 |
+
# method_name = json_file['method_name']
|
| 276 |
+
# method_counters[method_name] += 1
|
| 277 |
+
# unique_method_name = f"{method_name}_{method_counters[method_name]}"
|
| 278 |
|
| 279 |
+
# method_scores = []
|
|
|
|
|
|
|
| 280 |
|
| 281 |
+
# for result in json_file['results']:
|
| 282 |
+
# model = result['model_id']
|
|
|
|
|
|
|
| 283 |
|
| 284 |
+
# for task, scores in result['task_scores'].items():
|
| 285 |
+
# # Process each layer's data
|
| 286 |
+
# intervention_scores = defaultdict(list)
|
|
|
|
| 287 |
|
| 288 |
+
# for layer_data in scores:
|
| 289 |
+
# for intervention_data in layer_data['layer_scores']:
|
| 290 |
+
# # Calculate average score for counterfactuals
|
| 291 |
+
# avg_cf_score = np.mean([
|
| 292 |
+
# cf['score']
|
| 293 |
+
# for cf in intervention_data['counterfactual_scores']
|
| 294 |
+
# ])
|
| 295 |
+
|
| 296 |
+
# if np.isnan(avg_cf_score):
|
| 297 |
+
# avg_cf_score = 0.0
|
| 298 |
+
|
| 299 |
+
# # Group scores by intervention
|
| 300 |
+
# intervention_key = '_'.join(intervention_data['intervention'])
|
| 301 |
+
# intervention_scores[intervention_key].append(avg_cf_score)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
+
# # Average across layers for each intervention
|
| 304 |
+
# for intervention, layer_scores in intervention_scores.items():
|
| 305 |
+
# column = f"{model}_{task}_{intervention}"
|
| 306 |
+
# avg_score = np.mean(layer_scores) if layer_scores else 0.0
|
| 307 |
+
# method_scores.append((column, f"{avg_score:.3f}"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
| 309 |
+
# # Sort by column names for consistency
|
| 310 |
+
# method_scores.sort(key=lambda x: x[0])
|
| 311 |
+
# data[unique_method_name] = {
|
| 312 |
+
# col: score for col, score in method_scores
|
| 313 |
+
# }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
+
# return pd.DataFrame.from_dict(data, orient='index')
|
| 316 |
+
|
| 317 |
+
# def to_dict(self):
|
| 318 |
+
# """Converts the Eval Result to a dict for dataframe display"""
|
| 319 |
+
# data_dict = {
|
| 320 |
+
# "eval_name": self.eval_name,
|
| 321 |
+
# "Method": self.method_name,
|
| 322 |
+
# }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
+
# # Add all results directly
|
| 325 |
+
# data_dict.update(self.results)
|
| 326 |
|
| 327 |
+
# return data_dict
|
|
|
|
| 328 |
|
| 329 |
|
| 330 |
|
| 331 |
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
    model_result_filepaths = []
    ...
    # print(f"Found json files: {model_result_filepaths}")

+    method_counters = defaultdict(int)
+    dataframes = []
+
+    for json_file in model_result_filepaths:
        try:
+            with open(json_file, 'r') as f:
+                json_data = json.load(f)
+                method_name = json_data['method_name']
+                method_counters[method_name] += 1
+
+                # Process single JSON file
+                df = process_single_json(json_data, method_counters[method_name])
+                dataframes.append(df)
        except Exception as e:
+            print(f"Error processing {json_file}: {e}")
            continue
+
+    # # Concatenate all DataFrames
+    # if dataframes:
+    #     final_df = pd.concat(dataframes, axis=0)
+    #     return final_df
+    # else:
+    #     return pd.DataFrame()
+    return dataframes
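As changed above, the loader now returns the list of per-file DataFrames and leaves concatenation to the caller (the pd.concat block is commented out). A hedged sketch of that calling pattern, assuming each element is a DataFrame as produced by process_single_json; the frames below are stand-ins.

from typing import List

import pandas as pd

# Hypothetical caller: flatten the list of per-file frames returned above into
# a single DataFrame, falling back to an empty frame when nothing was loaded.
def flatten_results(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    if not dataframes:
        return pd.DataFrame()
    return pd.concat(dataframes, axis=0)

# Stand-in frames, one per results JSON file.
dfs = [pd.DataFrame({"Method": ["DAS"], "score": [0.41]}),
       pd.DataFrame({"Method": ["DAS_2"], "score": [0.47]})]
print(flatten_results(dfs))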
src/populate.py
CHANGED
@@ -29,14 +29,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-    # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
-    # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
-
-    # print(f"df is {df}")
-
-    # df = df[cols].round(decimals=1)
-
-    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
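The kept filter line drops every submission row that is missing a value in any benchmark column. has_no_nan_values is a small helper defined elsewhere in this repository; the version below is only a plausible sketch of its behaviour, not the repository's actual implementation.

import pandas as pd

# Assumed shape of the helper used above: True for rows that have a value in
# every benchmark column (sketch only).
def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
    return df[columns].notna().all(axis=1)

df = pd.DataFrame({"Method": ["A", "B"], "ioi": [0.9, None], "mcqa": [0.8, 0.7]})
print(df[has_no_nan_values(df, ["ioi", "mcqa"])])  # keeps only row "A"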
@@ -69,23 +61,6 @@ def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols:


-# def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
-#     """Aggregates rows with the same base method name by taking the max value for each column"""
-#     df_copy = df.copy()
-
-#     # Extract base method names (remove _2, _3, etc. suffixes)
-#     base_methods = [name.split('_')[0] if '_' in name and name.split('_')[-1].isdigit()
-#                     else name for name in df_copy.index]
-#     df_copy.index = base_methods
-
-#     # Convert scores to numeric values
-#     numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
-
-#     # Group by base method name and take the max
-#     aggregated_df = numeric_df.groupby(level=0).max().round(3)
-
-#     return aggregated_df
-
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """Aggregates rows with the same base method name by taking the max value for each column"""
     df_copy = df.copy()
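The kept aggregate_methods (its body continues in the next hunk) collapses repeated submissions of the same method, such as DAS, DAS_2, DAS_3, into one row by taking the column-wise maximum. A self-contained sketch of that idea, mirroring the removed commented draft above; column names and scores are illustrative.

import pandas as pd

# Sketch of max-aggregation over repeated method submissions: strip a trailing
# _<digit> suffix from the index, group by the base name, take the max.
def aggregate_methods_sketch(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.index = [
        name.split("_")[0] if "_" in name and name.split("_")[-1].isdigit() else name
        for name in out.index
    ]
    return out.select_dtypes(include=["float64", "int64"]).groupby(level=0).max().round(3)

scores = pd.DataFrame(
    {"ioi_gpt2": [0.41, 0.47], "mcqa_qwen": [0.30, 0.28]},
    index=["DAS", "DAS_2"],
)
print(aggregate_methods_sketch(scores))  # one "DAS" row: ioi_gpt2 0.47, mcqa_qwen 0.30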
@@ -111,63 +86,6 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:

    return aggregated_df

-# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
-#     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
-#     df_copy = df.copy()
-
-#     # Remove the Method column and eval_name if present
-#     columns_to_drop = ['Method', 'eval_name']
-#     df_copy = df_copy.drop(columns=[col for col in columns_to_drop if col in df_copy.columns])
-
-#     # Group columns by model_task
-#     model_task_groups = {}
-#     for col in df_copy.columns:
-#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-#         if model_task not in model_task_groups:
-#             model_task_groups[model_task] = []
-#         model_task_groups[model_task].append(col)
-
-#     # Create new DataFrame with averaged intervention scores
-#     averaged_df = pd.DataFrame({
-#         model_task: df_copy[cols].mean(axis=1).round(3)
-#         for model_task, cols in model_task_groups.items()
-#     })
-
-#     return averaged_df
-
-# def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
-#     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
-#     df_copy = df.copy()
-
-#     # Store Method column if it exists
-#     method_col = None
-#     if 'Method' in df_copy.columns:
-#         method_col = df_copy['Method']
-#         df_copy = df_copy.drop('Method', axis=1)
-
-#     # Remove eval_name if present
-#     if 'eval_name' in df_copy.columns:
-#         df_copy = df_copy.drop('eval_name', axis=1)
-
-#     # Group columns by model_task
-#     model_task_groups = {}
-#     for col in df_copy.columns:
-#         model_task = '_'.join(col.split('_')[:2])  # Get model_task part
-#         if model_task not in model_task_groups:
-#             model_task_groups[model_task] = []
-#         model_task_groups[model_task].append(col)
-
-#     # Create new DataFrame with averaged intervention scores
-#     averaged_df = pd.DataFrame({
-#         model_task: df_copy[cols].mean(axis=1).round(3)
-#         for model_task, cols in model_task_groups.items()
-#     })
-
-#     # Add Method column back
-#     if method_col is not None:
-#         averaged_df.insert(0, 'Method', method_col)
-
-#     return averaged_df

 def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     """Creates a DataFrame where columns are model_task and cells are averaged over interventions"""
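create_intervention_averaged_df, kept above (its body continues in the next hunk), averages the per-intervention columns that share a model_task prefix. A toy sketch of that column-grouping step, mirroring the removed drafts; the column names are illustrative.

import pandas as pd

# Sketch: columns named "<model>_<task>_<intervention>" are grouped by their
# "<model>_<task>" prefix and averaged row-wise (illustrative column names).
def average_over_interventions(df: pd.DataFrame) -> pd.DataFrame:
    groups = {}
    for col in df.columns:
        model_task = "_".join(col.split("_")[:2])
        groups.setdefault(model_task, []).append(col)
    return pd.DataFrame({mt: df[cols].mean(axis=1).round(3) for mt, cols in groups.items()})

scores = pd.DataFrame(
    {"qwen_mcqa_output_token": [0.4], "qwen_mcqa_output_location": [0.6]},
    index=["DAS"],
)
print(average_over_interventions(scores))  # single qwen_mcqa column equal to 0.5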
@@ -203,69 +121,32 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:

    return averaged_df

-# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-#     """Creates a dataframe from all the MIB causal graph experiment results"""
-#     print(f"results_path is {results_path}, requests_path is {requests_path}")
-#     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-#     print(f"raw_data is {raw_data}")
-
-#     # Convert each result to dict format for detailed df
-#     all_data_json = [v.to_dict() for v in raw_data]
-#     detailed_df = pd.DataFrame.from_records(all_data_json)
-#     print(f"detailed_df is: {detailed_df}")
-
-#     # Create and print other views for debugging/reference
-#     aggregated_df = aggregate_methods(detailed_df)
-#     print(f"aggregated_df is: {aggregated_df}")
-
-#     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
-#     print(f"intervention_averaged_df is: {intervention_averaged_df}")
-
-#     # Only return detailed_df for display
-#     return detailed_df

-# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-#     print(f"results_path is {results_path}, requests_path is {requests_path}")
-#     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
-
-#     # Convert each result to dict format for detailed df
-#     all_data_json = [v.to_dict() for v in raw_data]
-#     detailed_df = pd.DataFrame.from_records(all_data_json)
-#     print("Columns in detailed_df:", detailed_df.columns.tolist()) # Print actual columns
-
-#     # Create aggregated df
-#     aggregated_df = aggregate_methods(detailed_df)
-#     print("Columns in aggregated_df:", aggregated_df.columns.tolist())
-
-#     # Create intervention-averaged df
-#     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
-#     print("Columns in intervention_averaged_df:", intervention_averaged_df.columns.tolist())
-
-#     return detailed_df, aggregated_df, intervention_averaged_df

-def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str
    # print(f"results_path is {results_path}, requests_path is {requests_path}")
-
-
-    #
-
-
    # Print the actual columns for debugging
-
-
-    # Rename columns to match schema
-    column_mapping = {}
-    for col in detailed_df.columns:
-
-
-
-
-
-
-
-
-    detailed_df = detailed_df.rename(columns=column_mapping)
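The removed block above is replaced by the added lines that follow. As a reference for the calling convention implied by the new signature, a minimal sketch of a function returning the three views (detailed, aggregated, intervention-averaged) and of a caller unpacking them; the frames are stand-ins, not leaderboard data.

import pandas as pd
from typing import Tuple

# Sketch of the three-frame return shape used by the rewritten function below.
def leaderboard_views_sketch() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    detailed = pd.DataFrame({"Method": ["DAS"], "qwen_mcqa_output_token": [0.45]})
    aggregated = detailed.copy()  # stand-in for aggregate_methods(detailed)
    averaged = pd.DataFrame({"Method": ["DAS"], "qwen_mcqa": [0.45]})  # stand-in for create_intervention_averaged_df(aggregated)
    return detailed, aggregated, averaged

detailed_df, aggregated_df, intervention_averaged_df = leaderboard_views_sketch()
print(intervention_averaged_df)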
+def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # print(f"results_path is {results_path}, requests_path is {requests_path}")
+
+    # raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+    # all_data_json = [v.to_dict() for v in raw_data]
+    # detailed_df = pd.DataFrame.from_records(all_data_json)
+
+    detailed_df = get_raw_eval_results_mib_causalgraph(results_path, requests_path)

    # Print the actual columns for debugging
+    print("Original columns:", detailed_df.columns.tolist())
+
+    # # Rename columns to match schema
+    # column_mapping = {}
+    # for col in detailed_df.columns:
+    #     if col in ['eval_name', 'Method']:
+    #         continue
+    #     # Ensure consistent casing for the column names
+    #     new_col = col.replace('Qwen2ForCausalLM', 'qwen2forcausallm') \
+    #         .replace('Gemma2ForCausalLM', 'gemma2forcausallm') \
+    #         .replace('LlamaForCausalLM', 'llamaforcausallm')
+    #     column_mapping[col] = new_col
+
+    # detailed_df = detailed_df.rename(columns=column_mapping)

    # Create aggregated df
    aggregated_df = aggregate_methods(detailed_df)