Amber / eval_mmlu.json
omkarenator's picture
Upload folder using huggingface_hub
e651c65
raw
history blame
14.2 kB
{
"results": {
"hendrycksTest-abstract_algebra": {
"acc": 0.27,
"acc_stderr": 0.04461960433384741,
"acc_norm": 0.27,
"acc_norm_stderr": 0.04461960433384741
},
"hendrycksTest-anatomy": {
"acc": 0.34814814814814815,
"acc_stderr": 0.041153246103369526,
"acc_norm": 0.34814814814814815,
"acc_norm_stderr": 0.041153246103369526
},
"hendrycksTest-astronomy": {
"acc": 0.29605263157894735,
"acc_stderr": 0.037150621549989056,
"acc_norm": 0.29605263157894735,
"acc_norm_stderr": 0.037150621549989056
},
"hendrycksTest-business_ethics": {
"acc": 0.38,
"acc_stderr": 0.048783173121456316,
"acc_norm": 0.38,
"acc_norm_stderr": 0.048783173121456316
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.30943396226415093,
"acc_stderr": 0.028450154794118627,
"acc_norm": 0.30943396226415093,
"acc_norm_stderr": 0.028450154794118627
},
"hendrycksTest-college_biology": {
"acc": 0.3125,
"acc_stderr": 0.038760854559127644,
"acc_norm": 0.3125,
"acc_norm_stderr": 0.038760854559127644
},
"hendrycksTest-college_chemistry": {
"acc": 0.19,
"acc_stderr": 0.039427724440366234,
"acc_norm": 0.19,
"acc_norm_stderr": 0.039427724440366234
},
"hendrycksTest-college_computer_science": {
"acc": 0.37,
"acc_stderr": 0.04852365870939099,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"hendrycksTest-college_mathematics": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-college_medicine": {
"acc": 0.2774566473988439,
"acc_stderr": 0.03414014007044036,
"acc_norm": 0.2774566473988439,
"acc_norm_stderr": 0.03414014007044036
},
"hendrycksTest-college_physics": {
"acc": 0.20588235294117646,
"acc_stderr": 0.04023382273617747,
"acc_norm": 0.20588235294117646,
"acc_norm_stderr": 0.04023382273617747
},
"hendrycksTest-computer_security": {
"acc": 0.44,
"acc_stderr": 0.04988876515698589,
"acc_norm": 0.44,
"acc_norm_stderr": 0.04988876515698589
},
"hendrycksTest-conceptual_physics": {
"acc": 0.23404255319148937,
"acc_stderr": 0.02767845257821239,
"acc_norm": 0.23404255319148937,
"acc_norm_stderr": 0.02767845257821239
},
"hendrycksTest-econometrics": {
"acc": 0.2719298245614035,
"acc_stderr": 0.04185774424022056,
"acc_norm": 0.2719298245614035,
"acc_norm_stderr": 0.04185774424022056
},
"hendrycksTest-electrical_engineering": {
"acc": 0.3103448275862069,
"acc_stderr": 0.03855289616378948,
"acc_norm": 0.3103448275862069,
"acc_norm_stderr": 0.03855289616378948
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2671957671957672,
"acc_stderr": 0.022789673145776564,
"acc_norm": 0.2671957671957672,
"acc_norm_stderr": 0.022789673145776564
},
"hendrycksTest-formal_logic": {
"acc": 0.31746031746031744,
"acc_stderr": 0.04163453031302859,
"acc_norm": 0.31746031746031744,
"acc_norm_stderr": 0.04163453031302859
},
"hendrycksTest-global_facts": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-high_school_biology": {
"acc": 0.267741935483871,
"acc_stderr": 0.025189006660212385,
"acc_norm": 0.267741935483871,
"acc_norm_stderr": 0.025189006660212385
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.2512315270935961,
"acc_stderr": 0.030516530732694433,
"acc_norm": 0.2512315270935961,
"acc_norm_stderr": 0.030516530732694433
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.32,
"acc_stderr": 0.04688261722621504,
"acc_norm": 0.32,
"acc_norm_stderr": 0.04688261722621504
},
"hendrycksTest-high_school_european_history": {
"acc": 0.30303030303030304,
"acc_stderr": 0.035886248000917075,
"acc_norm": 0.30303030303030304,
"acc_norm_stderr": 0.035886248000917075
},
"hendrycksTest-high_school_geography": {
"acc": 0.31313131313131315,
"acc_stderr": 0.03304205087813653,
"acc_norm": 0.31313131313131315,
"acc_norm_stderr": 0.03304205087813653
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.25906735751295334,
"acc_stderr": 0.03161877917935409,
"acc_norm": 0.25906735751295334,
"acc_norm_stderr": 0.03161877917935409
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2128205128205128,
"acc_stderr": 0.02075242372212802,
"acc_norm": 0.2128205128205128,
"acc_norm_stderr": 0.02075242372212802
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.24444444444444444,
"acc_stderr": 0.02620276653465215,
"acc_norm": 0.24444444444444444,
"acc_norm_stderr": 0.02620276653465215
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.2184873949579832,
"acc_stderr": 0.026841514322958945,
"acc_norm": 0.2184873949579832,
"acc_norm_stderr": 0.026841514322958945
},
"hendrycksTest-high_school_physics": {
"acc": 0.2847682119205298,
"acc_stderr": 0.03684881521389023,
"acc_norm": 0.2847682119205298,
"acc_norm_stderr": 0.03684881521389023
},
"hendrycksTest-high_school_psychology": {
"acc": 0.30642201834862387,
"acc_stderr": 0.019765517220458523,
"acc_norm": 0.30642201834862387,
"acc_norm_stderr": 0.019765517220458523
},
"hendrycksTest-high_school_statistics": {
"acc": 0.25,
"acc_stderr": 0.029531221160930918,
"acc_norm": 0.25,
"acc_norm_stderr": 0.029531221160930918
},
"hendrycksTest-high_school_us_history": {
"acc": 0.2549019607843137,
"acc_stderr": 0.030587591351604257,
"acc_norm": 0.2549019607843137,
"acc_norm_stderr": 0.030587591351604257
},
"hendrycksTest-high_school_world_history": {
"acc": 0.2911392405063291,
"acc_stderr": 0.029571601065753374,
"acc_norm": 0.2911392405063291,
"acc_norm_stderr": 0.029571601065753374
},
"hendrycksTest-human_aging": {
"acc": 0.21524663677130046,
"acc_stderr": 0.02758406660220826,
"acc_norm": 0.21524663677130046,
"acc_norm_stderr": 0.02758406660220826
},
"hendrycksTest-human_sexuality": {
"acc": 0.3053435114503817,
"acc_stderr": 0.04039314978724562,
"acc_norm": 0.3053435114503817,
"acc_norm_stderr": 0.04039314978724562
},
"hendrycksTest-international_law": {
"acc": 0.36363636363636365,
"acc_stderr": 0.04391326286724071,
"acc_norm": 0.36363636363636365,
"acc_norm_stderr": 0.04391326286724071
},
"hendrycksTest-jurisprudence": {
"acc": 0.32407407407407407,
"acc_stderr": 0.04524596007030049,
"acc_norm": 0.32407407407407407,
"acc_norm_stderr": 0.04524596007030049
},
"hendrycksTest-logical_fallacies": {
"acc": 0.2085889570552147,
"acc_stderr": 0.031921934489347235,
"acc_norm": 0.2085889570552147,
"acc_norm_stderr": 0.031921934489347235
},
"hendrycksTest-machine_learning": {
"acc": 0.21428571428571427,
"acc_stderr": 0.03894641120044793,
"acc_norm": 0.21428571428571427,
"acc_norm_stderr": 0.03894641120044793
},
"hendrycksTest-management": {
"acc": 0.2524271844660194,
"acc_stderr": 0.04301250399690875,
"acc_norm": 0.2524271844660194,
"acc_norm_stderr": 0.04301250399690875
},
"hendrycksTest-marketing": {
"acc": 0.32051282051282054,
"acc_stderr": 0.030572811310299604,
"acc_norm": 0.32051282051282054,
"acc_norm_stderr": 0.030572811310299604
},
"hendrycksTest-medical_genetics": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.27,
"acc_norm_stderr": 0.044619604333847394
},
"hendrycksTest-miscellaneous": {
"acc": 0.3167305236270754,
"acc_stderr": 0.01663556642771247,
"acc_norm": 0.3167305236270754,
"acc_norm_stderr": 0.01663556642771247
},
"hendrycksTest-moral_disputes": {
"acc": 0.30057803468208094,
"acc_stderr": 0.0246853168672578,
"acc_norm": 0.30057803468208094,
"acc_norm_stderr": 0.0246853168672578
},
"hendrycksTest-moral_scenarios": {
"acc": 0.27262569832402234,
"acc_stderr": 0.014893391735249588,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-nutrition": {
"acc": 0.3137254901960784,
"acc_stderr": 0.02656892101545716,
"acc_norm": 0.3137254901960784,
"acc_norm_stderr": 0.02656892101545716
},
"hendrycksTest-philosophy": {
"acc": 0.34726688102893893,
"acc_stderr": 0.027040745502307336,
"acc_norm": 0.34726688102893893,
"acc_norm_stderr": 0.027040745502307336
},
"hendrycksTest-prehistory": {
"acc": 0.33024691358024694,
"acc_stderr": 0.026168298456732852,
"acc_norm": 0.33024691358024694,
"acc_norm_stderr": 0.026168298456732852
},
"hendrycksTest-professional_accounting": {
"acc": 0.2765957446808511,
"acc_stderr": 0.026684564340460983,
"acc_norm": 0.2765957446808511,
"acc_norm_stderr": 0.026684564340460983
},
"hendrycksTest-professional_law": {
"acc": 0.2711864406779661,
"acc_stderr": 0.011354581451622985,
"acc_norm": 0.2711864406779661,
"acc_norm_stderr": 0.011354581451622985
},
"hendrycksTest-professional_medicine": {
"acc": 0.20955882352941177,
"acc_stderr": 0.02472311040767705,
"acc_norm": 0.20955882352941177,
"acc_norm_stderr": 0.02472311040767705
},
"hendrycksTest-professional_psychology": {
"acc": 0.28921568627450983,
"acc_stderr": 0.018342529845275908,
"acc_norm": 0.28921568627450983,
"acc_norm_stderr": 0.018342529845275908
},
"hendrycksTest-public_relations": {
"acc": 0.34545454545454546,
"acc_stderr": 0.04554619617541054,
"acc_norm": 0.34545454545454546,
"acc_norm_stderr": 0.04554619617541054
},
"hendrycksTest-security_studies": {
"acc": 0.2653061224489796,
"acc_stderr": 0.028263889943784596,
"acc_norm": 0.2653061224489796,
"acc_norm_stderr": 0.028263889943784596
},
"hendrycksTest-sociology": {
"acc": 0.22885572139303484,
"acc_stderr": 0.02970528405677244,
"acc_norm": 0.22885572139303484,
"acc_norm_stderr": 0.02970528405677244
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.39,
"acc_stderr": 0.04902071300001975,
"acc_norm": 0.39,
"acc_norm_stderr": 0.04902071300001975
},
"hendrycksTest-virology": {
"acc": 0.29518072289156627,
"acc_stderr": 0.035509201856896294,
"acc_norm": 0.29518072289156627,
"acc_norm_stderr": 0.035509201856896294
},
"hendrycksTest-world_religions": {
"acc": 0.34502923976608185,
"acc_stderr": 0.036459813773888065,
"acc_norm": 0.34502923976608185,
"acc_norm_stderr": 0.036459813773888065
}
},
"versions": {
"hendrycksTest-abstract_algebra": 1,
"hendrycksTest-anatomy": 1,
"hendrycksTest-astronomy": 1,
"hendrycksTest-business_ethics": 1,
"hendrycksTest-clinical_knowledge": 1,
"hendrycksTest-college_biology": 1,
"hendrycksTest-college_chemistry": 1,
"hendrycksTest-college_computer_science": 1,
"hendrycksTest-college_mathematics": 1,
"hendrycksTest-college_medicine": 1,
"hendrycksTest-college_physics": 1,
"hendrycksTest-computer_security": 1,
"hendrycksTest-conceptual_physics": 1,
"hendrycksTest-econometrics": 1,
"hendrycksTest-electrical_engineering": 1,
"hendrycksTest-elementary_mathematics": 1,
"hendrycksTest-formal_logic": 1,
"hendrycksTest-global_facts": 1,
"hendrycksTest-high_school_biology": 1,
"hendrycksTest-high_school_chemistry": 1,
"hendrycksTest-high_school_computer_science": 1,
"hendrycksTest-high_school_european_history": 1,
"hendrycksTest-high_school_geography": 1,
"hendrycksTest-high_school_government_and_politics": 1,
"hendrycksTest-high_school_macroeconomics": 1,
"hendrycksTest-high_school_mathematics": 1,
"hendrycksTest-high_school_microeconomics": 1,
"hendrycksTest-high_school_physics": 1,
"hendrycksTest-high_school_psychology": 1,
"hendrycksTest-high_school_statistics": 1,
"hendrycksTest-high_school_us_history": 1,
"hendrycksTest-high_school_world_history": 1,
"hendrycksTest-human_aging": 1,
"hendrycksTest-human_sexuality": 1,
"hendrycksTest-international_law": 1,
"hendrycksTest-jurisprudence": 1,
"hendrycksTest-logical_fallacies": 1,
"hendrycksTest-machine_learning": 1,
"hendrycksTest-management": 1,
"hendrycksTest-marketing": 1,
"hendrycksTest-medical_genetics": 1,
"hendrycksTest-miscellaneous": 1,
"hendrycksTest-moral_disputes": 1,
"hendrycksTest-moral_scenarios": 1,
"hendrycksTest-nutrition": 1,
"hendrycksTest-philosophy": 1,
"hendrycksTest-prehistory": 1,
"hendrycksTest-professional_accounting": 1,
"hendrycksTest-professional_law": 1,
"hendrycksTest-professional_medicine": 1,
"hendrycksTest-professional_psychology": 1,
"hendrycksTest-public_relations": 1,
"hendrycksTest-security_studies": 1,
"hendrycksTest-sociology": 1,
"hendrycksTest-us_foreign_policy": 1,
"hendrycksTest-virology": 1,
"hendrycksTest-world_religions": 1
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=./workdir_7b_16mix/ckpt_356",
"num_fewshot": 5,
"batch_size": "1",
"batch_sizes": [],
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}