Bram Vanroy
committed on
Commit
·
5693ee5
1
Parent(s):
0ae29b2
revision for Dutch only
Browse files- .gitignore +107 -126
- app.py +58 -91
- content.py +15 -23
- css.py +0 -13
- evals/arc/arc_nl_Llama-2-7b-chat-hf.json +6 -6
- evals/arc/arc_nl_Llama-2-7b-hf.json +6 -6
- evals/arc/{arc_nl_Mistral-7B-v0.1.json → arc_nl_Orca-2-7b.json} +6 -6
- evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json → arc/arc_nl_gpt2-large-dutch.json} +8 -8
- evals/arc/arc_nl_gpt2-medium-dutch.json +23 -0
- evals/arc/arc_nl_zephyr-7b-beta.json +6 -6
- evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json +6 -6
- evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json +6 -6
- evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json +6 -6
- evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json → hellaswag_nl_Orca-2-7b.json} +6 -6
- evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json +23 -0
- evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json +23 -0
- evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json +23 -0
- evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json → mmlu/mmlu_nl_Mistral-7B-v0.1.json} +8 -8
- evals/mmlu/mmlu_nl_gpt2-large-dutch.json +23 -0
- evals/mmlu/mmlu_nl_gpt2-medium-dutch.json +23 -0
- evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json +0 -23
- evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json +6 -6
- evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json +4 -4
- evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json → truthfulqa_nl_Orca-2-7b.json} +6 -6
- evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json +0 -23
- evals/truthfulqa/truthfulqa_nl_falcon-40b.json +0 -23
- evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json → truthfulqa_nl_gpt2-large-dutch.json} +6 -6
- evals/truthfulqa/{truthfulqa_nl-falcon-40b.json → truthfulqa_nl_gpt2-medium-dutch.json} +6 -6
- evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json +0 -23
- evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json +0 -23
.gitignore
CHANGED
|
@@ -1,92 +1,42 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
wandb*
|
| 5 |
-
Pipfile*
|
| 6 |
-
data/*
|
| 7 |
-
muss
|
| 8 |
-
models/*
|
| 9 |
-
*config.json
|
| 10 |
-
|
| 11 |
-
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
| 12 |
-
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
| 13 |
-
|
| 14 |
-
.idea/
|
| 15 |
-
# User-specific stuff
|
| 16 |
-
.idea/**/workspace.xml
|
| 17 |
-
.idea/**/tasks.xml
|
| 18 |
-
.idea/**/usage.statistics.xml
|
| 19 |
-
.idea/**/dictionaries
|
| 20 |
-
.idea/**/shelf
|
| 21 |
-
|
| 22 |
-
# AWS User-specific
|
| 23 |
-
.idea/**/aws.xml
|
| 24 |
-
|
| 25 |
-
# Generated files
|
| 26 |
-
.idea/**/contentModel.xml
|
| 27 |
-
|
| 28 |
-
# Sensitive or high-churn files
|
| 29 |
-
.idea/**/dataSources/
|
| 30 |
-
.idea/**/dataSources.ids
|
| 31 |
-
.idea/**/dataSources.local.xml
|
| 32 |
-
.idea/**/sqlDataSources.xml
|
| 33 |
-
.idea/**/dynamic.xml
|
| 34 |
-
.idea/**/uiDesigner.xml
|
| 35 |
-
.idea/**/dbnavigator.xml
|
| 36 |
-
|
| 37 |
-
# Gradle
|
| 38 |
-
.idea/**/gradle.xml
|
| 39 |
-
.idea/**/libraries
|
| 40 |
-
|
| 41 |
-
# Gradle and Maven with auto-import
|
| 42 |
-
# When using Gradle or Maven with auto-import, you should exclude module files,
|
| 43 |
-
# since they will be recreated, and may cause churn. Uncomment if using
|
| 44 |
-
# auto-import.
|
| 45 |
-
# .idea/artifacts
|
| 46 |
-
# .idea/compiler.xml
|
| 47 |
-
# .idea/jarRepositories.xml
|
| 48 |
-
# .idea/modules.xml
|
| 49 |
-
# .idea/*.iml
|
| 50 |
-
# .idea/modules
|
| 51 |
-
# *.iml
|
| 52 |
-
# *.ipr
|
| 53 |
-
|
| 54 |
-
# CMake
|
| 55 |
-
cmake-build-*/
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
out/
|
| 65 |
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
crashlytics-build.properties
|
| 82 |
-
fabric.properties
|
| 83 |
|
| 84 |
-
# Editor-based Rest Client
|
| 85 |
-
.idea/httpRequests
|
| 86 |
|
| 87 |
-
#
|
| 88 |
-
|
|
|
|
| 89 |
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# Byte-compiled / optimized / DLL files
|
| 92 |
__pycache__/
|
|
@@ -110,7 +60,6 @@ parts/
|
|
| 110 |
sdist/
|
| 111 |
var/
|
| 112 |
wheels/
|
| 113 |
-
share/python-wheels/
|
| 114 |
*.egg-info/
|
| 115 |
.installed.cfg
|
| 116 |
*.egg
|
|
@@ -129,17 +78,14 @@ pip-delete-this-directory.txt
|
|
| 129 |
# Unit test / coverage reports
|
| 130 |
htmlcov/
|
| 131 |
.tox/
|
| 132 |
-
.nox/
|
| 133 |
.coverage
|
| 134 |
.coverage.*
|
| 135 |
.cache
|
| 136 |
nosetests.xml
|
| 137 |
coverage.xml
|
| 138 |
*.cover
|
| 139 |
-
*.py,cover
|
| 140 |
.hypothesis/
|
| 141 |
.pytest_cache/
|
| 142 |
-
cover/
|
| 143 |
|
| 144 |
# Translations
|
| 145 |
*.mo
|
|
@@ -149,7 +95,6 @@ cover/
|
|
| 149 |
*.log
|
| 150 |
local_settings.py
|
| 151 |
db.sqlite3
|
| 152 |
-
db.sqlite3-journal
|
| 153 |
|
| 154 |
# Flask stuff:
|
| 155 |
instance/
|
|
@@ -162,41 +107,16 @@ instance/
|
|
| 162 |
docs/_build/
|
| 163 |
|
| 164 |
# PyBuilder
|
| 165 |
-
.pybuilder/
|
| 166 |
target/
|
| 167 |
|
| 168 |
# Jupyter Notebook
|
| 169 |
.ipynb_checkpoints
|
| 170 |
|
| 171 |
-
# IPython
|
| 172 |
-
profile_default/
|
| 173 |
-
ipython_config.py
|
| 174 |
-
|
| 175 |
# pyenv
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
#
|
| 179 |
-
|
| 180 |
-
# pipenv
|
| 181 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 182 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 183 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 184 |
-
# install all needed dependencies.
|
| 185 |
-
#Pipfile.lock
|
| 186 |
-
|
| 187 |
-
# poetry
|
| 188 |
-
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 189 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 190 |
-
# commonly ignored for libraries.
|
| 191 |
-
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 192 |
-
#poetry.lock
|
| 193 |
-
|
| 194 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 195 |
-
__pypackages__/
|
| 196 |
-
|
| 197 |
-
# Celery stuff
|
| 198 |
celerybeat-schedule
|
| 199 |
-
celerybeat.pid
|
| 200 |
|
| 201 |
# SageMath parsed files
|
| 202 |
*.sage.py
|
|
@@ -222,21 +142,82 @@ venv.bak/
|
|
| 222 |
|
| 223 |
# mypy
|
| 224 |
.mypy_cache/
|
| 225 |
-
.
|
| 226 |
-
dmypy.json
|
| 227 |
|
| 228 |
-
#
|
| 229 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
-
#
|
| 232 |
-
.
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
|
| 236 |
|
| 237 |
-
#
|
| 238 |
-
|
| 239 |
-
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 240 |
-
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 241 |
-
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 242 |
-
#.idea/
|
|
|
|
| 1 |
+
run-backend.ps
|
| 2 |
+
.eslintrc.js
|
| 3 |
+
.venv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
# ignore compiled styles
|
| 6 |
+
*.css
|
| 7 |
|
| 8 |
+
# dependencies
|
| 9 |
+
**/node_modules/
|
| 10 |
+
**/.pnp
|
| 11 |
+
*.pnp.js
|
|
|
|
| 12 |
|
| 13 |
+
# testing
|
| 14 |
+
/coverage
|
| 15 |
|
| 16 |
+
# VSCode
|
| 17 |
+
**/.vscode/
|
| 18 |
|
| 19 |
+
# production
|
| 20 |
+
**/build/
|
| 21 |
|
| 22 |
+
# misc
|
| 23 |
+
.DS_Store
|
| 24 |
+
.env.local
|
| 25 |
+
.env.development.local
|
| 26 |
+
.env.test.local
|
| 27 |
+
.env.production.local
|
| 28 |
|
| 29 |
+
npm-debug.log*
|
| 30 |
+
yarn-debug.log*
|
| 31 |
+
yarn-error.log*
|
|
|
|
|
|
|
| 32 |
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
# python
|
| 35 |
+
data/
|
| 36 |
+
Pipfile*
|
| 37 |
|
| 38 |
+
# .idea (JetBrains)
|
| 39 |
+
**/.idea/
|
| 40 |
|
| 41 |
# Byte-compiled / optimized / DLL files
|
| 42 |
__pycache__/
|
|
|
|
| 60 |
sdist/
|
| 61 |
var/
|
| 62 |
wheels/
|
|
|
|
| 63 |
*.egg-info/
|
| 64 |
.installed.cfg
|
| 65 |
*.egg
|
|
|
|
| 78 |
# Unit test / coverage reports
|
| 79 |
htmlcov/
|
| 80 |
.tox/
|
|
|
|
| 81 |
.coverage
|
| 82 |
.coverage.*
|
| 83 |
.cache
|
| 84 |
nosetests.xml
|
| 85 |
coverage.xml
|
| 86 |
*.cover
|
|
|
|
| 87 |
.hypothesis/
|
| 88 |
.pytest_cache/
|
|
|
|
| 89 |
|
| 90 |
# Translations
|
| 91 |
*.mo
|
|
|
|
| 95 |
*.log
|
| 96 |
local_settings.py
|
| 97 |
db.sqlite3
|
|
|
|
| 98 |
|
| 99 |
# Flask stuff:
|
| 100 |
instance/
|
|
|
|
| 107 |
docs/_build/
|
| 108 |
|
| 109 |
# PyBuilder
|
|
|
|
| 110 |
target/
|
| 111 |
|
| 112 |
# Jupyter Notebook
|
| 113 |
.ipynb_checkpoints
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
# pyenv
|
| 116 |
+
.python-version
|
| 117 |
+
|
| 118 |
+
# celery beat schedule file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
celerybeat-schedule
|
|
|
|
| 120 |
|
| 121 |
# SageMath parsed files
|
| 122 |
*.sage.py
|
|
|
|
| 142 |
|
| 143 |
# mypy
|
| 144 |
.mypy_cache/
|
| 145 |
+
test.py
|
|
|
|
| 146 |
|
| 147 |
+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
| 148 |
+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
| 149 |
+
|
| 150 |
+
# User-specific stuff
|
| 151 |
+
.idea/**/workspace.xml
|
| 152 |
+
.idea/**/tasks.xml
|
| 153 |
+
.idea/**/usage.statistics.xml
|
| 154 |
+
.idea/**/dictionaries
|
| 155 |
+
.idea/**/shelf
|
| 156 |
+
|
| 157 |
+
# AWS User-specific
|
| 158 |
+
.idea/**/aws.xml
|
| 159 |
+
|
| 160 |
+
# Generated files
|
| 161 |
+
.idea/**/contentModel.xml
|
| 162 |
+
|
| 163 |
+
# Sensitive or high-churn files
|
| 164 |
+
.idea/**/dataSources/
|
| 165 |
+
.idea/**/dataSources.ids
|
| 166 |
+
.idea/**/dataSources.local.xml
|
| 167 |
+
.idea/**/sqlDataSources.xml
|
| 168 |
+
.idea/**/dynamic.xml
|
| 169 |
+
.idea/**/uiDesigner.xml
|
| 170 |
+
.idea/**/dbnavigator.xml
|
| 171 |
+
|
| 172 |
+
# Gradle
|
| 173 |
+
.idea/**/gradle.xml
|
| 174 |
+
.idea/**/libraries
|
| 175 |
+
|
| 176 |
+
# Gradle and Maven with auto-import
|
| 177 |
+
# When using Gradle or Maven with auto-import, you should exclude module files,
|
| 178 |
+
# since they will be recreated, and may cause churn. Uncomment if using
|
| 179 |
+
# auto-import.
|
| 180 |
+
# .idea/artifacts
|
| 181 |
+
# .idea/compiler.xml
|
| 182 |
+
# .idea/jarRepositories.xml
|
| 183 |
+
# .idea/modules.xml
|
| 184 |
+
# .idea/*.iml
|
| 185 |
+
# .idea/modules
|
| 186 |
+
# *.iml
|
| 187 |
+
# *.ipr
|
| 188 |
+
|
| 189 |
+
# CMake
|
| 190 |
+
cmake-build-*/
|
| 191 |
+
|
| 192 |
+
# Mongo Explorer plugin
|
| 193 |
+
.idea/**/mongoSettings.xml
|
| 194 |
+
|
| 195 |
+
# File-based project format
|
| 196 |
+
*.iws
|
| 197 |
+
|
| 198 |
+
# IntelliJ
|
| 199 |
+
out/
|
| 200 |
+
|
| 201 |
+
# mpeltonen/sbt-idea plugin
|
| 202 |
+
.idea_modules/
|
| 203 |
+
|
| 204 |
+
# JIRA plugin
|
| 205 |
+
atlassian-ide-plugin.xml
|
| 206 |
+
|
| 207 |
+
# Cursive Clojure plugin
|
| 208 |
+
.idea/replstate.xml
|
| 209 |
+
|
| 210 |
+
# SonarLint plugin
|
| 211 |
+
.idea/sonarlint/
|
| 212 |
|
| 213 |
+
# Crashlytics plugin (for Android Studio and IntelliJ)
|
| 214 |
+
com_crashlytics_export_strings.xml
|
| 215 |
+
crashlytics.properties
|
| 216 |
+
crashlytics-build.properties
|
| 217 |
+
fabric.properties
|
| 218 |
|
| 219 |
+
# Editor-based Rest Client
|
| 220 |
+
.idea/httpRequests
|
| 221 |
|
| 222 |
+
# Android studio 3.1+ serialized cache file
|
| 223 |
+
.idea/caches/build_file_checksums.ser
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -2,12 +2,13 @@ import json
|
|
| 2 |
from collections import defaultdict
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
import gradio as gr
|
|
|
|
|
|
|
| 7 |
|
| 8 |
from content import *
|
| 9 |
-
from css import *
|
| 10 |
-
import glob
|
| 11 |
|
| 12 |
ARC = "arc"
|
| 13 |
HELLASWAG = "hellaswag"
|
|
@@ -17,51 +18,17 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
|
|
| 17 |
|
| 18 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
"
|
| 27 |
-
"de": "German",
|
| 28 |
-
"es": "Spanish",
|
| 29 |
-
"eu": "Basque",
|
| 30 |
-
"fr": "French",
|
| 31 |
-
"gu": "Gujarati",
|
| 32 |
-
"hi": "Hindi",
|
| 33 |
-
"hr": "Croatian",
|
| 34 |
-
"hu": "Hungarian",
|
| 35 |
-
"hy": "Armenian",
|
| 36 |
-
"id": "Indonesian",
|
| 37 |
-
"it": "Italian",
|
| 38 |
-
"kn": "Kannada",
|
| 39 |
-
"ml": "Malayalam",
|
| 40 |
-
"mr": "Marathi",
|
| 41 |
-
"ne": "Nepali",
|
| 42 |
-
"nl": "Dutch",
|
| 43 |
-
"pt": "Portuguese",
|
| 44 |
-
"ro": "Romanian",
|
| 45 |
-
"ru": "Russian",
|
| 46 |
-
"sk": "Slovak",
|
| 47 |
-
"sr": "Serbian",
|
| 48 |
-
"sv": "Swedish",
|
| 49 |
-
"ta": "Tamil",
|
| 50 |
-
"te": "Telugu",
|
| 51 |
-
"uk": "Ukrainian",
|
| 52 |
-
"vi": "Vietnamese",
|
| 53 |
-
"zh": "Chinese",
|
| 54 |
-
}
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def collect_results():
|
| 58 |
performance_dict = defaultdict(dict)
|
| 59 |
-
pretrained_models = set()
|
| 60 |
for pfin in Path("evals").rglob("*.json"):
|
| 61 |
data = json.loads(pfin.read_text(encoding="utf-8"))
|
| 62 |
-
if "results" not in data:
|
| 63 |
-
continue
|
| 64 |
-
if "config" not in data:
|
| 65 |
continue
|
| 66 |
results = data["results"]
|
| 67 |
config = data["config"]
|
|
@@ -74,7 +41,6 @@ def collect_results():
|
|
| 74 |
continue
|
| 75 |
pretrained = pretrained[0].split("=")[1]
|
| 76 |
pretrained = pretrained.split("/")[-1]
|
| 77 |
-
pretrained_models.add(pretrained)
|
| 78 |
|
| 79 |
for lang_task, perfs in results.items():
|
| 80 |
task, lang = lang_task.split("_")
|
|
@@ -85,33 +51,46 @@ def collect_results():
|
|
| 85 |
p = round(perfs[metric] * 100, 1)
|
| 86 |
performance_dict[(pretrained, lang)][task] = p
|
| 87 |
|
| 88 |
-
return performance_dict
|
| 89 |
|
| 90 |
|
| 91 |
-
def
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
for (pretrained, lang), perfs in performance_dict.items():
|
| 94 |
-
lang_name = LANG_NAME[lang]
|
| 95 |
arc_perf = perfs.get(ARC, 0.0)
|
| 96 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
| 97 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
| 98 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
| 99 |
|
| 100 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
| 101 |
-
|
| 102 |
-
row
|
| 103 |
-
df.append(row)
|
| 104 |
|
| 105 |
-
df = pd.DataFrame.from_records(
|
| 106 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
| 107 |
-
df = df[COLS]
|
| 108 |
-
|
| 109 |
return df
|
| 110 |
|
| 111 |
|
| 112 |
-
def
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
MODEL_COL = "Model"
|
|
@@ -120,43 +99,31 @@ ARC_COL = "ARC (25-shot)"
|
|
| 120 |
HELLASWAG_COL = "HellaSwag (10-shot)οΈ"
|
| 121 |
MMLU_COL = "MMLU (5-shot)"
|
| 122 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
| 123 |
-
NOTES_COL = "Notes" # For search only
|
| 124 |
-
|
| 125 |
-
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
|
| 126 |
-
TYPES = ["str", "number", "number", "number", "number", "number", "str"]
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
| 133 |
gr.HTML(TITLE)
|
| 134 |
-
gr.Markdown(INTRO_TEXT
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
)
|
| 147 |
-
|
| 148 |
-
# # Dummy leaderboard for handling the case when the user uses backspace key
|
| 149 |
-
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
| 150 |
-
value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
|
| 151 |
-
)
|
| 152 |
-
|
| 153 |
-
search_bar.change(
|
| 154 |
-
search_table,
|
| 155 |
-
[hidden_leaderboard_table_for_search, search_bar],
|
| 156 |
-
leaderboard_table,
|
| 157 |
-
)
|
| 158 |
|
| 159 |
gr.Markdown(CREDIT, elem_classes="markdown-text")
|
| 160 |
gr.Markdown(CITATION, elem_classes="markdown-text")
|
| 161 |
|
| 162 |
-
|
|
|
|
|
|
|
|
|
| 2 |
from collections import defaultdict
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
+
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
import gradio as gr
|
| 8 |
+
from pandas import DataFrame
|
| 9 |
+
from pandas.io.formats.style import Styler
|
| 10 |
|
| 11 |
from content import *
|
|
|
|
|
|
|
| 12 |
|
| 13 |
ARC = "arc"
|
| 14 |
HELLASWAG = "hellaswag"
|
|
|
|
| 18 |
|
| 19 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
|
| 23 |
+
"""
|
| 24 |
+
Collects results from the evals folder and returns a dictionary of results
|
| 25 |
+
:return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
|
| 26 |
+
dictionaries of the form {benchmark_name: performance_score}
|
| 27 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
performance_dict = defaultdict(dict)
|
|
|
|
| 29 |
for pfin in Path("evals").rglob("*.json"):
|
| 30 |
data = json.loads(pfin.read_text(encoding="utf-8"))
|
| 31 |
+
if "results" not in data or "config" not in data:
|
|
|
|
|
|
|
| 32 |
continue
|
| 33 |
results = data["results"]
|
| 34 |
config = data["config"]
|
|
|
|
| 41 |
continue
|
| 42 |
pretrained = pretrained[0].split("=")[1]
|
| 43 |
pretrained = pretrained.split("/")[-1]
|
|
|
|
| 44 |
|
| 45 |
for lang_task, perfs in results.items():
|
| 46 |
task, lang = lang_task.split("_")
|
|
|
|
| 51 |
p = round(perfs[metric] * 100, 1)
|
| 52 |
performance_dict[(pretrained, lang)][task] = p
|
| 53 |
|
| 54 |
+
return dict(performance_dict)
|
| 55 |
|
| 56 |
|
| 57 |
+
def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
|
| 58 |
+
"""
|
| 59 |
+
Builds a dataframe from the performance dictionary
|
| 60 |
+
:param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
|
| 61 |
+
dictionaries of the form {benchmark_name: performance_score}
|
| 62 |
+
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
|
| 63 |
+
"""
|
| 64 |
+
data = []
|
| 65 |
for (pretrained, lang), perfs in performance_dict.items():
|
|
|
|
| 66 |
arc_perf = perfs.get(ARC, 0.0)
|
| 67 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
| 68 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
| 69 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
| 70 |
|
| 71 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
| 72 |
+
row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
|
| 73 |
+
data.append(row)
|
|
|
|
| 74 |
|
| 75 |
+
df = pd.DataFrame.from_records(data, columns=COLS)
|
| 76 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
|
|
|
|
|
|
| 77 |
return df
|
| 78 |
|
| 79 |
|
| 80 |
+
def style_df(df: DataFrame) -> Styler:
|
| 81 |
+
"""
|
| 82 |
+
Styles the dataframe by rounding to two decimals and putting the max value in bold per column
|
| 83 |
+
:param df: the dataframe to style
|
| 84 |
+
:return: the Styler
|
| 85 |
+
"""
|
| 86 |
+
styler = df.style.format("{:.2f}", subset=df.columns[1:])
|
| 87 |
+
|
| 88 |
+
def highlight_max(col):
|
| 89 |
+
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
|
| 90 |
+
|
| 91 |
+
styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
|
| 92 |
+
|
| 93 |
+
return styler
|
| 94 |
|
| 95 |
|
| 96 |
MODEL_COL = "Model"
|
|
|
|
| 99 |
HELLASWAG_COL = "HellaSwag (10-shot)οΈ"
|
| 100 |
MMLU_COL = "MMLU (5-shot)"
|
| 101 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
+
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
|
| 104 |
+
TYPES = ["str", "number", "number", "number", "number", "number"]
|
| 105 |
|
| 106 |
+
results = collect_results()
|
| 107 |
+
original_df = build_performance_df(results)
|
| 108 |
+
styled_df = style_df(original_df)
|
| 109 |
+
with gr.Blocks() as demo:
|
| 110 |
gr.HTML(TITLE)
|
| 111 |
+
gr.Markdown(INTRO_TEXT)
|
| 112 |
+
|
| 113 |
+
gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
|
| 114 |
+
gr.components.Dataframe(
|
| 115 |
+
value=original_df,
|
| 116 |
+
headers=COLS,
|
| 117 |
+
datatype=TYPES,
|
| 118 |
+
elem_id="leaderboard-table",
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
gr.Markdown("## LaTeX")
|
| 122 |
+
gr.Code(styled_df.to_latex(convert_css=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
gr.Markdown(CREDIT, elem_classes="markdown-text")
|
| 125 |
gr.Markdown(CITATION, elem_classes="markdown-text")
|
| 126 |
|
| 127 |
+
if __name__ == '__main__':
|
| 128 |
+
demo.launch()
|
| 129 |
+
|
content.py
CHANGED
|
@@ -1,44 +1,29 @@
|
|
| 1 |
-
TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard</h1>'
|
| 2 |
|
| 3 |
INTRO_TEXT = f"""
|
| 4 |
## About
|
| 5 |
|
| 6 |
-
This
|
| 7 |
-
|
| 8 |
-
Our current leaderboard provides evaluation data for 29 languages, i.e.,
|
| 9 |
-
Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
|
| 10 |
-
French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
|
| 11 |
-
Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
|
| 12 |
-
Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
|
| 13 |
-
Both multilingual and language-specific LLMs are welcome in this leaderboard.
|
| 14 |
-
We currently evaluate models over four benchmarks:
|
| 15 |
|
| 16 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
|
| 17 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
|
| 18 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
|
| 19 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
|
| 20 |
|
| 21 |
-
|
| 22 |
|
| 23 |
"""
|
| 24 |
|
| 25 |
-
HOW_TO = f"""
|
| 26 |
-
## How to list your model performance on this leaderboard:
|
| 27 |
-
|
| 28 |
-
Run the evaluation of your model using this repo: <a href="https://github.com/laiviet/lm-evaluation-harness" target="_blank">https://github.com/laiviet/lm-evaluation-harness</a>.
|
| 29 |
-
|
| 30 |
-
And then, push the evaluation log and make a pull request.
|
| 31 |
-
"""
|
| 32 |
-
|
| 33 |
CREDIT = f"""
|
| 34 |
## Credit
|
| 35 |
|
| 36 |
-
|
| 37 |
|
| 38 |
- Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
|
| 39 |
-
- Funding and GPU access (Adobe Research)
|
| 40 |
- Evaluation code (EleutherAI's lm_evaluation_harness repo)
|
| 41 |
- Leaderboard code (Huggingface4's open_llm_leaderboard repo)
|
|
|
|
| 42 |
|
| 43 |
"""
|
| 44 |
|
|
@@ -46,12 +31,19 @@ To make this website, we use the following resources:
|
|
| 46 |
CITATION = f"""
|
| 47 |
## Citation
|
| 48 |
|
| 49 |
-
```
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
@misc{{lai2023openllmbenchmark,
|
| 52 |
author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
|
| 53 |
title={{Open Multilingual LLM Evaluation Leaderboard}},
|
| 54 |
year={{2023}}
|
| 55 |
}}
|
| 56 |
```
|
| 57 |
-
"""
|
|
|
|
| 1 |
+
TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard (Dutch only)</h1>'
|
| 2 |
|
| 3 |
INTRO_TEXT = f"""
|
| 4 |
## About
|
| 5 |
|
| 6 |
+
This is a fork of the [Open Multilingual LLM Evaluation Leaderboard](https://huggingface.co/spaces/uonlp/open_multilingual_llm_leaderboard), but restricted to only Dutch models and augmented with additional model results.
|
| 7 |
+
We test the models on the following benchmarks **for the Dutch version only!!**, which have been translated into Dutch automatically by the original authors of the Open Multilingual LLM Evaluation Leaderboard with `gpt-35-turbo`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
|
| 10 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
|
| 11 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
|
| 12 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
|
| 13 |
|
| 14 |
+
I do not maintain those datasets; I only run benchmarks and add the results to this space. For questions regarding the test sets or running them yourself, see [the original GitHub repository](https://github.com/laiviet/lm-evaluation-harness).
|
| 15 |
|
| 16 |
"""
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
CREDIT = f"""
|
| 19 |
## Credit
|
| 20 |
|
| 21 |
+
This leaderboard has borrowed heavily from the following sources:
|
| 22 |
|
| 23 |
- Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
|
|
|
|
| 24 |
- Evaluation code (EleutherAI's lm_evaluation_harness repo)
|
| 25 |
- Leaderboard code (Huggingface4's open_llm_leaderboard repo)
|
| 26 |
+
- The multilingual version of the leaderboard (uonlp's open_multilingual_llm_leaderboard repo)
|
| 27 |
|
| 28 |
"""
|
| 29 |
|
|
|
|
| 31 |
CITATION = f"""
|
| 32 |
## Citation
|
| 33 |
|
|
|
|
| 34 |
|
| 35 |
+
If you use or cite the Dutch benchmark results or this specific leaderboard page, please cite the following paper:
|
| 36 |
+
|
| 37 |
+
TBD
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
If you use the multilingual benchmarks, please cite the following paper:
|
| 41 |
+
|
| 42 |
+
```bibtex
|
| 43 |
@misc{{lai2023openllmbenchmark,
|
| 44 |
author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
|
| 45 |
title={{Open Multilingual LLM Evaluation Leaderboard}},
|
| 46 |
year={{2023}}
|
| 47 |
}}
|
| 48 |
```
|
| 49 |
+
"""
|
css.py
DELETED
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
CUSTOM_CSS = """
|
| 2 |
-
/* Hides the final column */
|
| 3 |
-
table td:last-child,
|
| 4 |
-
table th:last-child {
|
| 5 |
-
display: none;
|
| 6 |
-
}
|
| 7 |
-
# table td:first-child,
|
| 8 |
-
# table th:first-child {
|
| 9 |
-
# max-width: 400px;
|
| 10 |
-
# overflow: auto;
|
| 11 |
-
# white-space: nowrap;
|
| 12 |
-
# }
|
| 13 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_nl_Llama-2-7b-chat-hf.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
+
"acc": 0.3550042771599658,
|
| 5 |
+
"acc_stderr": 0.014001474982174305,
|
| 6 |
+
"acc_norm": 0.3609923011120616,
|
| 7 |
+
"acc_norm_stderr": 0.014053373664144789
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/arc/arc_nl_Llama-2-7b-hf.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
+
"acc": 0.33447390932420873,
|
| 5 |
+
"acc_stderr": 0.013805185437125271,
|
| 6 |
+
"acc_norm": 0.3558597091531223,
|
| 7 |
+
"acc_norm_stderr": 0.014009035017396714
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/arc/{arc_nl_Mistral-7B-v0.1.json → arc_nl_Orca-2-7b.json}
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
+
"acc": 0.3661248930710009,
|
| 5 |
+
"acc_stderr": 0.014095972894279241,
|
| 6 |
+
"acc_norm": 0.3678357570573139,
|
| 7 |
+
"acc_norm_stderr": 0.014109788842173
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json → arc/arc_nl_gpt2-large-dutch.json}
RENAMED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
-
"
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
| 11 |
-
"
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
+
"arc_nl": {
|
| 4 |
+
"acc": 0.20102651839178784,
|
| 5 |
+
"acc_stderr": 0.011726581781869408,
|
| 6 |
+
"acc_norm": 0.24037639007698888,
|
| 7 |
+
"acc_norm_stderr": 0.01250327289928353
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
| 11 |
+
"arc_nl": 0
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/arc/arc_nl_gpt2-medium-dutch.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_nl": {
|
| 4 |
+
"acc": 0.21471343028229256,
|
| 5 |
+
"acc_stderr": 0.012014958326088981,
|
| 6 |
+
"acc_norm": 0.24294268605645852,
|
| 7 |
+
"acc_norm_stderr": 0.012548588352773891
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_nl": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc/arc_nl_zephyr-7b-beta.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"arc_nl": {
|
| 4 |
+
"acc": 0.4311377245508982,
|
| 5 |
+
"acc_stderr": 0.014490726457652989,
|
| 6 |
+
"acc_norm": 0.43199315654405473,
|
| 7 |
+
"acc_norm_stderr": 0.014494184864971338
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.3838100377765785,
|
| 5 |
+
"acc_stderr": 0.005052614927289456,
|
| 6 |
+
"acc_norm": 0.4819212088505127,
|
| 7 |
+
"acc_norm_stderr": 0.005191425828002782
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.386184565569347,
|
| 5 |
+
"acc_stderr": 0.00505844561828187,
|
| 6 |
+
"acc_norm": 0.4957366432811657,
|
| 7 |
+
"acc_norm_stderr": 0.0051946338704556266
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.4336751214247167,
|
| 5 |
+
"acc_stderr": 0.0051489159372014965,
|
| 6 |
+
"acc_norm": 0.5662169454937939,
|
| 7 |
+
"acc_norm_stderr": 0.005149065890785751
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json β hellaswag_nl_Orca-2-7b.json}
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
-
"acc": 0.
|
| 5 |
-
"acc_stderr": 0.
|
| 6 |
-
"acc_norm": 0.
|
| 7 |
-
"acc_norm_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.38456556934700487,
|
| 5 |
+
"acc_stderr": 0.005054483938257531,
|
| 6 |
+
"acc_norm": 0.48041014570966,
|
| 7 |
+
"acc_norm_stderr": 0.005190834031799853
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.3043712898003238,
|
| 5 |
+
"acc_stderr": 0.004780698091128437,
|
| 6 |
+
"acc_norm": 0.34279546681057743,
|
| 7 |
+
"acc_norm_stderr": 0.004931380767300367
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"hellaswag_nl": 1
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.31246627091203455,
|
| 5 |
+
"acc_stderr": 0.004815587775923881,
|
| 6 |
+
"acc_norm": 0.36438208310847275,
|
| 7 |
+
"acc_norm_stderr": 0.00500008398696681
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"hellaswag_nl": 1
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hellaswag_nl": {
|
| 4 |
+
"acc": 0.44069077172153265,
|
| 5 |
+
"acc_stderr": 0.0051581467942195215,
|
| 6 |
+
"acc_norm": 0.5429033998920669,
|
| 7 |
+
"acc_norm_stderr": 0.005175663147811796
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"hellaswag_nl": 1
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=Intel/neural-chat-7b-v3-1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json β mmlu/mmlu_nl_Mistral-7B-v0.1.json}
RENAMED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
-
"
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
| 11 |
-
"
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
+
"mmlu_nl": {
|
| 4 |
+
"acc": 0.45974045685664416,
|
| 5 |
+
"acc_stderr": 0.004341759787221058,
|
| 6 |
+
"acc_norm": 0.36912802610609396,
|
| 7 |
+
"acc_norm_stderr": 0.0042040447899996366
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
| 11 |
+
"mmlu_nl": 0
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/mmlu/mmlu_nl_gpt2-large-dutch.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"mmlu_nl": {
|
| 4 |
+
"acc": 0.2301737876603172,
|
| 5 |
+
"acc_stderr": 0.003667182186959482,
|
| 6 |
+
"acc_norm": 0.2436821734841011,
|
| 7 |
+
"acc_norm_stderr": 0.0037400056232706905
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"mmlu_nl": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/mmlu/mmlu_nl_gpt2-medium-dutch.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"mmlu_nl": {
|
| 4 |
+
"acc": 0.23343704940426502,
|
| 5 |
+
"acc_stderr": 0.0036852504856799066,
|
| 6 |
+
"acc_norm": 0.2483873415800258,
|
| 7 |
+
"acc_norm_stderr": 0.003764176503735655
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"mmlu_nl": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.2764331210191083,
|
| 5 |
-
"mc1_stderr": 0.01597262688062874,
|
| 6 |
-
"mc2": 0.4103755310313891,
|
| 7 |
-
"mc2_stderr": 0.014811313488625848
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"truthfulqa_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size": 8,
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.
|
| 5 |
-
"mc1_stderr": 0.
|
| 6 |
-
"mc2": 0.
|
| 7 |
-
"mc2_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
+
"mc1": 0.289171974522293,
|
| 5 |
+
"mc1_stderr": 0.016192068781346693,
|
| 6 |
+
"mc2": 0.4445882138885173,
|
| 7 |
+
"mc2_stderr": 0.016144169053565395
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
"mc1": 0.28152866242038216,
|
| 5 |
"mc1_stderr": 0.016062309899461683,
|
| 6 |
-
"mc2": 0.
|
| 7 |
-
"mc2_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
"mc1": 0.28152866242038216,
|
| 5 |
"mc1_stderr": 0.016062309899461683,
|
| 6 |
+
"mc2": 0.41449853431238814,
|
| 7 |
+
"mc2_stderr": 0.014922005996963188
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json β truthfulqa_nl_Orca-2-7b.json}
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.
|
| 5 |
-
"mc1_stderr": 0.
|
| 6 |
-
"mc2": 0.
|
| 7 |
-
"mc2_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
+
"mc1": 0.3146496815286624,
|
| 5 |
+
"mc1_stderr": 0.01658486445168711,
|
| 6 |
+
"mc2": 0.4488463711895695,
|
| 7 |
+
"mc2_stderr": 0.016292493035951996
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.310828025477707,
|
| 5 |
-
"mc1_stderr": 0.016529733724696277,
|
| 6 |
-
"mc2": 0.4460845208916539,
|
| 7 |
-
"mc2_stderr": 0.01476856418537487
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"truthfulqa_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size": 8,
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/truthfulqa_nl_falcon-40b.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.2764331210191083,
|
| 5 |
-
"mc1_stderr": 0.01597262688062875,
|
| 6 |
-
"mc2": 0.4091336161450544,
|
| 7 |
-
"mc2_stderr": 0.014605140809282338
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"truthfulqa_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size": 8,
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json β truthfulqa_nl_gpt2-large-dutch.json}
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.
|
| 5 |
-
"mc1_stderr": 0.
|
| 6 |
-
"mc2": 0.
|
| 7 |
-
"mc2_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
+
"mc1": 0.25987261146496815,
|
| 5 |
+
"mc1_stderr": 0.015663018533664023,
|
| 6 |
+
"mc2": 0.41961324970531233,
|
| 7 |
+
"mc2_stderr": 0.01509691194885121
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/truthfulqa/{truthfulqa_nl-falcon-40b.json β truthfulqa_nl_gpt2-medium-dutch.json}
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.
|
| 5 |
-
"mc1_stderr": 0.
|
| 6 |
-
"mc2": 0.
|
| 7 |
-
"mc2_stderr": 0.
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
@@ -12,8 +12,8 @@
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=
|
| 16 |
-
"batch_size":
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
|
|
|
| 1 |
{
|
| 2 |
"results": {
|
| 3 |
"truthfulqa_nl": {
|
| 4 |
+
"mc1": 0.2878980891719745,
|
| 5 |
+
"mc1_stderr": 0.0161708346142461,
|
| 6 |
+
"mc2": 0.4527386932512769,
|
| 7 |
+
"mc2_stderr": 0.015417954968769677
|
| 8 |
}
|
| 9 |
},
|
| 10 |
"versions": {
|
|
|
|
| 12 |
},
|
| 13 |
"config": {
|
| 14 |
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
+
"batch_size": "auto",
|
| 17 |
"device": "cuda",
|
| 18 |
"no_cache": false,
|
| 19 |
"limit": null,
|
evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.2751592356687898,
|
| 5 |
-
"mc1_stderr": 0.0159498029022655,
|
| 6 |
-
"mc2": 0.41816127879466414,
|
| 7 |
-
"mc2_stderr": 0.01474120131034505
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"truthfulqa_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size": 8,
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"truthfulqa_nl": {
|
| 4 |
-
"mc1": 0.3719745222929936,
|
| 5 |
-
"mc1_stderr": 0.0172618443903749,
|
| 6 |
-
"mc2": 0.5294532108691418,
|
| 7 |
-
"mc2_stderr": 0.016221848481192833
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"truthfulqa_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
| 16 |
-
"batch_size": 64,
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|