Update README.md
Browse files
README.md
CHANGED
|
@@ -231,52 +231,63 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2 and HumanEval_
|
|
| 231 |
<td><b>78.74</b></td>
|
| 232 |
<td><b>99.87</b></td>
|
| 233 |
</tr>
|
| 234 |
-
|
| 235 |
<!-- OpenLLM V2 -->
|
| 236 |
<tr>
|
| 237 |
-
<td rowspan="
|
| 238 |
<td>BBH</td>
|
| 239 |
-
<td
|
| 240 |
<td>63.81</td>
|
| 241 |
-
<td
|
| 242 |
</tr>
|
| 243 |
<tr>
|
| 244 |
<td>MMLU-Pro</td>
|
| 245 |
-
<td
|
| 246 |
<td>57.99</td>
|
| 247 |
-
<td
|
| 248 |
</tr>
|
| 249 |
<tr>
|
| 250 |
<td>MuSR</td>
|
| 251 |
-
<td
|
| 252 |
<td>42.99</td>
|
| 253 |
-
<td
|
| 254 |
</tr>
|
| 255 |
<tr>
|
| 256 |
<td>IFEval</td>
|
| 257 |
-
<td></td>
|
| 258 |
<td>88.25</td>
|
| 259 |
-
<td
|
|
|
|
| 260 |
</tr>
|
| 261 |
<tr>
|
| 262 |
<td>GPQA</td>
|
| 263 |
-
<td
|
| 264 |
<td>28.94</td>
|
| 265 |
-
<td
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
</tr>
|
| 267 |
<tr>
|
| 268 |
-
<td>Math
|
| 269 |
-
<td
|
| 270 |
-
<td>
|
| 271 |
-
<td
|
| 272 |
</tr>
|
| 273 |
<tr>
|
| 274 |
<td><b>Average</b></td>
|
| 275 |
-
<td></td>
|
| 276 |
-
<td><b>
|
| 277 |
-
<td></td>
|
| 278 |
</tr>
|
| 279 |
-
|
| 280 |
<!-- Coding -->
|
| 281 |
<tr>
|
| 282 |
<td rowspan="1"><b>Coding</b></td>
|
|
|
|
| 231 |
<td><b>78.74</b></td>
|
| 232 |
<td><b>99.87</b></td>
|
| 233 |
</tr>
|
|
|
|
| 234 |
<!-- OpenLLM V2 -->
|
| 235 |
<tr>
|
| 236 |
+
<td rowspan="6"><b>OpenLLM V2</b></td>
|
| 237 |
<td>BBH</td>
|
| 238 |
+
<td>63.67</td>
|
| 239 |
<td>63.81</td>
|
| 240 |
+
<td>100.22</td>
|
| 241 |
</tr>
|
| 242 |
<tr>
|
| 243 |
<td>MMLU-Pro</td>
|
| 244 |
+
<td>58.23</td>
|
| 245 |
<td>57.99</td>
|
| 246 |
+
<td>99.59</td>
|
| 247 |
</tr>
|
| 248 |
<tr>
|
| 249 |
<td>MuSR</td>
|
| 250 |
+
<td>43.25</td>
|
| 251 |
<td>42.99</td>
|
| 252 |
+
<td>99.40</td>
|
| 253 |
</tr>
|
| 254 |
<tr>
|
| 255 |
<td>IFEval</td>
|
|
|
|
| 256 |
<td>88.25</td>
|
| 257 |
+
<td>88.25</td>
|
| 258 |
+
<td>100.00</td>
|
| 259 |
</tr>
|
| 260 |
<tr>
|
| 261 |
<td>GPQA</td>
|
| 262 |
+
<td>29.28</td>
|
| 263 |
<td>28.94</td>
|
| 264 |
+
<td>98.84</td>
|
| 265 |
+
</tr>
|
| 266 |
+
<tr>
|
| 267 |
+
<td><b>Average</b></td>
|
| 268 |
+
<td><b>56.54</b></td>
|
| 269 |
+
<td><b>56.40</b></td>
|
| 270 |
+
<td><b>99.75</b></td>
|
| 271 |
+
</tr>
|
| 272 |
+
<tr>
|
| 273 |
+
<td rowspan="3"><b>Reasoning</b></td>
|
| 274 |
+
<td>GPQA (Diamond, 0-shot)</td>
|
| 275 |
+
<td>72.22</td>
|
| 276 |
+
<td>69.19</td>
|
| 277 |
+
<td>95.80</td>
|
| 278 |
</tr>
|
| 279 |
<tr>
|
| 280 |
+
<td>Math-500 (0-shot)</td>
|
| 281 |
+
<td>95.00</td>
|
| 282 |
+
<td>94.20</td>
|
| 283 |
+
<td>99.16</td>
|
| 284 |
</tr>
|
| 285 |
<tr>
|
| 286 |
<td><b>Average</b></td>
|
| 287 |
+
<td><b>83.61</b></td>
|
| 288 |
+
<td><b>81.70</b></td>
|
| 289 |
+
<td><b>97.72</b></td>
|
| 290 |
</tr>
|
|
|
|
| 291 |
<!-- Coding -->
|
| 292 |
<tr>
|
| 293 |
<td rowspan="1"><b>Coding</b></td>
|