nm-research committed on
Commit
1b32bc2
·
verified ·
1 Parent(s): 3af7783

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +31 -20
README.md CHANGED
@@ -231,52 +231,63 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2 and HumanEval_
231
  <td><b>78.74</b></td>
232
  <td><b>99.87</b></td>
233
  </tr>
234
-
235
  <!-- OpenLLM V2 -->
236
  <tr>
237
- <td rowspan="7"><b>OpenLLM V2</b></td>
238
  <td>BBH</td>
239
- <td></td>
240
  <td>63.81</td>
241
- <td></td>
242
  </tr>
243
  <tr>
244
  <td>MMLU-Pro</td>
245
- <td></td>
246
  <td>57.99</td>
247
- <td></td>
248
  </tr>
249
  <tr>
250
  <td>MuSR</td>
251
- <td></td>
252
  <td>42.99</td>
253
- <td></td>
254
  </tr>
255
  <tr>
256
  <td>IFEval</td>
257
- <td></td>
258
  <td>88.25</td>
259
- <td></td>
 
260
  </tr>
261
  <tr>
262
  <td>GPQA</td>
263
- <td></td>
264
  <td>28.94</td>
265
- <td></td>
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  </tr>
267
  <tr>
268
- <td>Math Hard</td>
269
- <td></td>
270
- <td>56.80</td>
271
- <td></td>
272
  </tr>
273
  <tr>
274
  <td><b>Average</b></td>
275
- <td></td>
276
- <td><b>56.46</b></td>
277
- <td></td>
278
  </tr>
279
-
280
  <!-- Coding -->
281
  <tr>
282
  <td rowspan="1"><b>Coding</b></td>
 
231
  <td><b>78.74</b></td>
232
  <td><b>99.87</b></td>
233
  </tr>
 
234
  <!-- OpenLLM V2 -->
235
  <tr>
236
+ <td rowspan="6"><b>OpenLLM V2</b></td>
237
  <td>BBH</td>
238
+ <td>63.67</td>
239
  <td>63.81</td>
240
+ <td>100.22</td>
241
  </tr>
242
  <tr>
243
  <td>MMLU-Pro</td>
244
+ <td>58.23</td>
245
  <td>57.99</td>
246
+ <td>99.59</td>
247
  </tr>
248
  <tr>
249
  <td>MuSR</td>
250
+ <td>43.25</td>
251
  <td>42.99</td>
252
+ <td>99.40</td>
253
  </tr>
254
  <tr>
255
  <td>IFEval</td>
 
256
  <td>88.25</td>
257
+ <td>88.25</td>
258
+ <td>100.00</td>
259
  </tr>
260
  <tr>
261
  <td>GPQA</td>
262
+ <td>29.28</td>
263
  <td>28.94</td>
264
+ <td>98.84</td>
265
+ </tr>
266
+ <tr>
267
+ <td><b>Average</b></td>
268
+ <td><b>56.54</b></td>
269
+ <td><b>56.40</b></td>
270
+ <td><b>99.75</b></td>
271
+ </tr>
272
+ <tr>
273
+ <td rowspan="3"><b>Reasoning</b></td>
274
+ <td>GPQA (Diamond, 0-shot)</td>
275
+ <td>72.22</td>
276
+ <td>69.19</td>
277
+ <td>95.80</td>
278
  </tr>
279
  <tr>
280
+ <td>Math-500 (0-shot)</td>
281
+ <td>95.00</td>
282
+ <td>94.20</td>
283
+ <td>99.16</td>
284
  </tr>
285
  <tr>
286
  <td><b>Average</b></td>
287
+ <td><b>83.61</b></td>
288
+ <td><b>81.70</b></td>
289
+ <td><b>97.72</b></td>
290
  </tr>
 
291
  <!-- Coding -->
292
  <tr>
293
  <td rowspan="1"><b>Coding</b></td>