Commit 781a150
Parent(s): 863f952

making the submits + removing internal / external

Files changed:
- app.py +40 -7
- src/envs.py +2 -0
- src/gen/gen_answer.py +2 -1
- src/gen/gen_judgment.py +5 -10
- src/gen/show_result.py +2 -2
- src/leaderboard/build_leaderboard.py +9 -8
app.py
CHANGED

@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 import subprocess

@@ -18,6 +19,7 @@ from src.display.utils import (
 )
 from src.envs import (
     API,
+    DATA_PATH,
     H4_TOKEN,
     HF_HOME,
     HF_TOKEN_PRIVATE,

@@ -26,8 +28,9 @@ from src.envs import (
     PERSISTENT_FILE_CHECK_PATH,
     REPO_ID,
     RESET_JUDGEMENT_ENV,
+    SUBMITS_META_FILE,
 )
-from src.leaderboard.build_leaderboard import build_leadearboard_df,
+from src.leaderboard.build_leaderboard import build_leadearboard_df, download_meta

 os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"

@@ -37,7 +40,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()

-
+download_meta()


 def build_demo():

@@ -75,15 +78,45 @@ def build_demo():

         with gr.Column():
             model_name_textbox = gr.Textbox(label="Model name")
-            submitter_username = gr.Textbox(label="Username")
+            submitter_username = gr.Textbox(label="Username")  # can we get this info from hf??

         def upload_file(file):
-
-
+            file_name = file.name.split("/")[-1] if "/" in file.name else file.name
+
+            with open(f"{DATA_PATH}/{SUBMITS_META_FILE}", "r", encoding="utf-8") as submit_meta_file:
+                current_info = json.loads(submit_meta_file)
+
+            # for now just do not save same name model
+            if model_name_textbox in current_info:
+                return False
+
+            submit_info = {
+                "username": submitter_username,
+                "file_name": file_name,
+            }
+
+            current_info[model_name_textbox] = submit_info
+
+            with open(f"{DATA_PATH}/{SUBMITS_META_FILE}", "w", encoding="utf-8") as submit_meta_file:
+                submit_meta_file.write(json.dumps(current_info))
+
+            logging.info(
+                "New submition: file from %s saved to %s with model %s",
+                submitter_username,
+                file_name,
+                model_name_textbox,
+            )
             API.upload_file(
                 path_or_fileobj=file.name,
-                path_in_repo="model_answers/
-                repo_id=
+                path_in_repo="arena-hard-v0.1/model_answers/" + file_name,
+                repo_id=METAINFO_DATASET,
+                repo_type="dataset",
+                token=HF_TOKEN_PRIVATE,
+            )
+            API.upload_file(
+                path_or_fileobj=SUBMITS_META_FILE,
+                path_in_repo=SUBMITS_META_FILE,
+                repo_id=METAINFO_DATASET,
                 repo_type="dataset",
                 token=HF_TOKEN_PRIVATE,
             )
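The new upload handler records each submission in submits_info.json (model name mapped to the submitter and answers file), refuses duplicate model names, and then pushes both the answers file and the updated metadata to the metainfo dataset. Below is a minimal standalone sketch of that read-update-write cycle; it is an illustration under assumptions, not the app's code. It takes the model name and username as plain strings (in the Gradio handler these would have to arrive as input values, not as the Textbox components), and it uses json.load on the open file handle, whereas the diff calls json.loads(submit_meta_file) on the file object itself.

import json
import logging
import os

# Assumed layout, mirroring DATA_PATH and SUBMITS_META_FILE from src/envs.py.
DATA_PATH = "data"
SUBMITS_META_FILE = "submits_info.json"
META_PATH = os.path.join(DATA_PATH, SUBMITS_META_FILE)


def register_submit(model_name: str, username: str, file_name: str) -> bool:
    """Hypothetical helper: record one submission, refusing duplicate model names."""
    # Existing metadata: {model_name: {"username": ..., "file_name": ...}}.
    if os.path.exists(META_PATH):
        with open(META_PATH, "r", encoding="utf-8") as f:
            current_info = json.load(f)  # json.load takes the file object
    else:
        current_info = {}

    # For now, simply do not register a model under an already-used name.
    if model_name in current_info:
        return False

    current_info[model_name] = {"username": username, "file_name": file_name}

    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(current_info, f, ensure_ascii=False)

    logging.info("New submission: file %s from %s registered for model %s",
                 file_name, username, model_name)
    return True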
src/envs.py
CHANGED

@@ -35,6 +35,8 @@ RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"

 API = HfApi(token=H4_TOKEN)

+# if any more info about service pls make this file just a json
+SUBMITS_META_FILE = "submits_info.json"
 PERSISTENT_FILE_CHECK = "persistent_file_check"
 PERSISTENT_FILE_CHECK_PATH = f"{DATA_PATH}/{PERSISTENT_FILE_CHECK}"

src/gen/gen_answer.py
CHANGED

@@ -3,6 +3,7 @@
 Usage:
 python gen_api_answer --parallel 32
 """
+
 import argparse
 import concurrent.futures
 import json

@@ -138,7 +139,7 @@ if __name__ == "__main__":
     settings = make_config(args.setting_file)
     endpoint_list = make_config(args.endpoint_file)

-    existing_answer = load_model_answers(os.path.join("data", settings["bench_name"], "model_answers
+    existing_answer = load_model_answers(os.path.join("data", settings["bench_name"], "model_answers"))

     print(settings)

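With the internal/external split removed, gen_answer.py now pre-loads existing answers from the single model_answers directory. load_model_answers itself is not part of this diff; the sketch below shows the behaviour it is assumed to have here (one <model>.jsonl file per model, records keyed by question_id). The function name, file layout, and record fields are assumptions for illustration.

import glob
import json
import os


def load_model_answers_sketch(answer_dir):
    """Assumed behaviour: map model name -> {question_id -> answer record}."""
    answers = {}
    for path in sorted(glob.glob(os.path.join(answer_dir, "*.jsonl"))):
        # The filename (without extension) is taken as the model name.
        model_name = os.path.splitext(os.path.basename(path))[0]
        per_question = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                record = json.loads(line)
                per_question[record["question_id"]] = record
        answers[model_name] = per_question
    return answers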
src/gen/gen_judgment.py
CHANGED

@@ -116,12 +116,12 @@ def judgment(**args):
     result = {"user_prompt": conv[1]["content"], "judgment": judgment, "score": score}
     output["games"].append(result)

-    with open(output_file, "a") as f:
+    with open(output_file, "a", encoding="utf-8") as f:
         f.write(json.dumps(output, ensure_ascii=False) + "\n")
     huggingface_hub.HfApi().upload_file(
         output_file,
         path_in_repo=f'model_judgment/{configs["judge_model"]}/{output_file.split("/")[-1]}',
-        repo_id="Vikhrmodels
+        repo_id="Vikhrmodels/-eval",
         repo_type="dataset",
     )

@@ -145,21 +145,16 @@ if __name__ == "__main__":
     pattern = re.compile(configs["regex_pattern"])

     question_file = os.path.join(f"{HF_HOME}/data", configs["bench_name"], "question.jsonl")
-
-    external_dir = os.path.join(f"{HF_HOME}/data", configs["bench_name"], "model_answers/external")
+    answers_dir = os.path.join(f"{HF_HOME}/data", configs["bench_name"], "model_answers")
     ref_answer_dir = os.path.join(f"{HF_HOME}/data", configs["bench_name"], "reference_answer")

     questions = load_questions(question_file)
-
-    model_answers_internal = load_model_answers(internal_dir)
-
-    # internal has priority
-    model_answers = {**model_answers_external, **model_answers_internal}
+    model_answers = load_model_answers(answers_dir)

     # if user choose a set of models, only judge those models
     models = [
         model.split("/")[-1].split(".")[0]
-        for model in glob.glob(f"{HF_HOME}/data/arena-hard-v0.1/model_answers
+        for model in glob.glob(f"{HF_HOME}/data/arena-hard-v0.1/model_answers/*.jsonl")
     ]

     ref_answers = None
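After this change the set of judged models is discovered straight from the flattened model_answers directory: every *.jsonl file contributes one model, and the model name is the part of the filename before the first dot. A small equivalent of that discovery step using pathlib; the directory path here is an assumption for illustration.

from pathlib import Path

# Assumed location of the flattened answers directory.
answers_dir = Path("data/arena-hard-v0.1/model_answers")

# Same result as model.split("/")[-1].split(".")[0] over glob("*.jsonl");
# note that a model name containing a dot would be cut at the first dot.
models = [path.name.split(".")[0] for path in sorted(answers_dir.glob("*.jsonl"))]
print(models)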
src/gen/show_result.py
CHANGED

@@ -198,7 +198,7 @@ if __name__ == "__main__":
         args.load_battles and args.load_bootstrap
     ), "If loading prexisting bootstrapping data, you must also load preexisting battles."

-    answer_dir = os.path.join("data", args.bench_name, "model_answers
+    answer_dir = os.path.join("data", args.bench_name, "model_answers")
     model_answers = load_model_answers(answer_dir)

     if args.load_battles:

@@ -265,7 +265,7 @@ if __name__ == "__main__":
     huggingface_hub.HfApi().upload_file(
         path_or_fileobj=json_file_name,
         path_in_repo="leaderboard.json",
-        repo_id=
+        repo_id=METAINFO_DATASET,
         repo_type="dataset",
         token=HF_TOKEN_PRIVATE,
     )
src/leaderboard/build_leaderboard.py
CHANGED

@@ -6,7 +6,7 @@ import time
 import pandas as pd
 from huggingface_hub import snapshot_download

-from src.envs import
+from src.envs import DATA_PATH, HF_TOKEN_PRIVATE, METAINFO_DATASET, SUBMITS_META_FILE

 # Configure logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

@@ -51,16 +51,17 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
     logging.error("Failed to download %s after %s attempts", repo_id, max_attempts)


-def
-    # download
+def download_meta():
+    # download all metainfo about submits
+    # also all the submits questions
+    # also all other stuff
     download_dataset(METAINFO_DATASET, DATA_PATH)

-    # download answers of different models that we trust
-    download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
-

 def build_leadearboard_df():
     # Retrieve the leaderboard DataFrame
     with open(f"{DATA_PATH}/leaderboard.json", "r", encoding="utf-8") as eval_file:
-
-
+        battle_info = pd.DataFrame.from_records(json.load(eval_file))
+    with open(f"{DATA_PATH}/{SUBMITS_META_FILE}", "r", encoding="utf-8") as submit_meta_file:
+        submit_info = pd.DataFrame.from_records(json.load(submit_meta_file))
+    return battle_info.copy()
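download_meta() pulls the metainfo dataset into DATA_PATH, and build_leadearboard_df() now reads both leaderboard.json and the submits metadata, although in this commit only the battle results are returned (submit_info is loaded but unused). Below is a hedged sketch of how the two could be combined, assuming leaderboard.json records carry a "model" field and submits_info.json is keyed by model name as written by app.py above.

import json

import pandas as pd

# Assumed local copies produced by download_meta(); names follow src/envs.py.
DATA_PATH = "data"
SUBMITS_META_FILE = "submits_info.json"

with open(f"{DATA_PATH}/leaderboard.json", "r", encoding="utf-8") as eval_file:
    battle_info = pd.DataFrame.from_records(json.load(eval_file))

with open(f"{DATA_PATH}/{SUBMITS_META_FILE}", "r", encoding="utf-8") as submit_meta_file:
    # {model_name: {"username": ..., "file_name": ...}} -> one row per model
    submit_info = (
        pd.DataFrame.from_dict(json.load(submit_meta_file), orient="index")
        .rename_axis("model")
        .reset_index()
    )

# Hypothetical join: assumes the leaderboard rows expose a "model" column.
leaderboard = battle_info.merge(submit_info, on="model", how="left")
print(leaderboard.head())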