Update README.md
Browse files
added BBH scores.
README.md
CHANGED
|
@@ -215,8 +215,8 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 215 |
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
|
| 216 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
| 217 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
| 218 |
-
|
| 219 |
-
<th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref2"><a href="#fn2">2</a></sup></th>
|
| 220 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
| 221 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
| 222 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
|
@@ -231,7 +231,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 231 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
|
| 232 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
| 233 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
| 234 |
-
|
| 235 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
|
| 236 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
| 237 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
|
@@ -246,7 +246,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 246 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
|
| 247 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
| 248 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
| 249 |
-
|
| 250 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
|
| 251 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
| 252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
|
@@ -261,7 +261,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 261 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
|
| 262 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
| 263 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
| 264 |
-
|
| 265 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
|
| 266 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
| 267 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
|
@@ -277,7 +277,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 277 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
|
| 278 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
| 279 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
| 280 |
-
|
| 281 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
|
| 282 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
| 283 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
|
@@ -293,7 +293,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 293 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
|
| 294 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
| 295 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
| 296 |
-
|
| 297 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
|
| 298 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
| 299 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
|
@@ -309,7 +309,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 309 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
|
| 310 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
| 311 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
| 312 |
-
|
| 313 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
|
| 314 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
| 315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
|
@@ -325,7 +325,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 325 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
|
| 326 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
| 327 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
| 328 |
-
|
| 329 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
|
| 330 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
| 331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
|
@@ -340,7 +340,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 340 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
|
| 341 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
| 342 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
| 343 |
-
|
| 344 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
|
| 345 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
| 346 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
|
@@ -356,7 +356,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 356 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
|
| 357 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
| 358 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
| 359 |
-
|
| 360 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
|
| 361 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
| 362 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
|
@@ -371,7 +371,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
| 371 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
|
| 372 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
| 373 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
| 374 |
-
|
| 375 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
|
| 376 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
| 377 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
|
@@ -528,4 +528,5 @@ Granite-3.3-2B-Instruct builds upon Granite-3.3-2B-Base, leveraging both permiss
|
|
| 528 |
|
| 529 |
|
| 530 |
<p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
|
|
|
|
| 531 |
<p><a href="#fnref2" title="Jump back to reference">[2]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>
|
|
|
|
| 215 |
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
|
| 216 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
| 217 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
| 218 |
+
<th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard<sup id="fnref2"><a href="#fn2">2</a></sup></th>
|
| 219 |
+
<th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref3"><a href="#fn3">3</a></sup></th>
|
| 220 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
| 221 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
| 222 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
|
|
|
| 231 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
|
| 232 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
| 233 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
| 234 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.82</td>
|
| 235 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
|
| 236 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
| 237 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
|
|
|
| 246 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
|
| 247 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
| 248 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
| 249 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.39</td>
|
| 250 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
|
| 251 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
| 252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
|
|
|
| 261 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
|
| 262 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
| 263 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
| 264 |
+
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 63.91 </td>
|
| 265 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
|
| 266 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
| 267 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
|
|
|
| 277 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
|
| 278 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
| 279 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
| 280 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.43</td>
|
| 281 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
|
| 282 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
| 283 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
|
|
|
| 293 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
|
| 294 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
| 295 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
| 296 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.39</td>
|
| 297 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
|
| 298 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
| 299 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
|
|
|
| 309 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
|
| 310 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
| 311 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
| 312 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.19</td>
|
| 313 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
|
| 314 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
| 315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
|
|
|
| 325 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
|
| 326 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
| 327 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
| 328 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.38</td>
|
| 329 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
|
| 330 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
| 331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
|
|
|
| 340 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
|
| 341 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
| 342 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
| 343 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.87</td>
|
| 344 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
|
| 345 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
| 346 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
|
|
|
| 356 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
|
| 357 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
| 358 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
| 359 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.86</td>
|
| 360 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
|
| 361 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
| 362 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
|
|
|
| 371 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
|
| 372 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
| 373 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
| 374 |
+
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 69.13 </td>
|
| 375 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
|
| 376 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
| 377 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
|
|
|
| 528 |
|
| 529 |
|
| 530 |
<p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
|
| 531 |
+
<p><a href="#fnref2" title="Jump back to reference">[2]</a> Added regex for more efficient answer extraction.</p>
|
| 532 |
<p><a href="#fnref3" title="Jump back to reference">[3]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>
|