import os
import json
import datetime
from email.utils import parseaddr

import requests
import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset, VerificationMode
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink

# Hugging Face token used for all Hub reads and writes (set as a Space secret).
TOKEN = os.environ.get("TOKEN", None)
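
# Hub repositories used by this leaderboard: the task data, the private and
# public submission archives, the public results, and the Space itself.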
OWNER = "Blanca"
DATA_DATASET = f"{OWNER}/CQs-Gen_test"
INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
RESULTS_DATASET = f"{OWNER}/results_public"
LEADERBOARD_PATH = "HiTZ/Critical_Questions_Leaderboard"
api = HfApi()

YEAR_VERSION = "2025"
# Number of gold items per split, used to normalise the final score.
ref_scores_len = {"test": 34}

os.makedirs("scored", exist_ok=True)

# Set to True to skip all Hub uploads when testing the app locally.
LOCAL_DEBUG = False
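
# Download the current public results for the test split. The download is
# forced and verification skipped so that restarts always pick up the latest
# file.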
test_results = load_dataset(
    RESULTS_DATASET,
    YEAR_VERSION,
    split="test",
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode=VerificationMode.NO_CHECKS,
    trust_remote_code=True,
)
eval_results = {"test": test_results}


def get_dataframe_from_results(eval_results, split):
    """Turn one results split into a display-ready DataFrame, sorted by score."""
    local_df = eval_results[split]
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt", "url"])
    local_df = local_df.rename_column("model", "Agent name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Score (%)")
    local_df = local_df.rename_column("date", "Submission date")
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Score (%)"], ascending=False)
    return df


eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
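
# Gold references used when scoring submissions. The id -> row mapping below
# assumes each gold example exposes an "id" field; adjust the key if the
# dataset schema differs.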
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)
gold_results = {"test": {row["id"]: row for row in gold_dataset["test"]}}


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]
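

# Submission flow: validate the account and the form, archive the raw file,
# score each prediction against the gold references, archive the scored copy,
# then append the new entry to the public results.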
def add_new_eval(
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
    # Reject accounts younger than 60 days to limit throwaway submissions.
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
        return format_error("This account is not authorized to submit on this leaderboard.")

    # Only the public test split is currently open for submissions.
    val_or_test = "test"
    is_validation = False

    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    # Refuse duplicate submissions of the same model from the same organisation.
    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Archive the raw submission in the private submissions dataset.
    if LOCAL_DEBUG:
        print("mock uploaded submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # Contact details associated with this submission (not shown on the leaderboard).
    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "username": profile.username,
        "mail": mail,
        "date": datetime.datetime.today().strftime('%Y-%m-%d'),
    }

    # Score the submission. The uploaded file is expected to be a JSON object
    # mapping task ids to predictions.
    file_path = path_to_file.name
    scores = 0
    num_questions = 0
    task_ids = []

    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
        with open(file_path, "r") as f:
            data = json.load(f)
        for task_id, prediction in data.items():
            # question_scorer comes from scorer.py; the call below assumes it
            # takes the prediction and the matching gold entry and returns a
            # numeric score. Adjust if its actual signature differs.
            score = question_scorer(prediction, gold_results[val_or_test].get(task_id))
            scored_file.write(
                json.dumps({
                    "id": task_id,
                    "score": score,
                }) + "\n"
            )
            task_ids.append(task_id)
            scores += score
            num_questions += 1

    if len(task_ids) != len(set(task_ids)):
        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

    # Archive the scored copy next to the raw submission.
    if LOCAL_DEBUG:
        print("mock uploaded scored submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # Validation submissions are additionally mirrored to the public dataset.
    if is_validation:
        api.upload_file(
            repo_id=SUBMISSION_DATASET_PUBLIC,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # Normalise by the number of gold items so unanswered questions count
    # against the score.
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "system_prompt": system_prompt,
        "url": url,
        "organisation": organisation,
        "score": scores / ref_scores_len[val_or_test],
        "date": datetime.datetime.today().strftime('%Y-%m-%d'),
    }
    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    if LOCAL_DEBUG:
        print("mock uploaded results to lb")
    else:
        eval_results[val_or_test].push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, split=val_or_test, token=TOKEN)

    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")


def refresh():
    # Re-download the results and rebuild the leaderboard table.
    test_results = load_dataset(
        RESULTS_DATASET,
        YEAR_VERSION,
        split="test",
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS,
        trust_remote_code=True,
    )
    eval_dataframe_test = get_dataframe_from_results(eval_results={"test": test_results}, split="test")
    return eval_dataframe_test


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths
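

# Leaderboard UI: the results table, a refresh button, and the submission form.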
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])

    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Agent name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()

        with gr.Row():
            gr.LoginButton()
            submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # Gradio injects the gr.OAuthProfile argument automatically because
        # add_new_eval annotates its last parameter with that type.
        submit_button.click(
            add_new_eval,
            [
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )
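

# Restart the Space every hour so the leaderboard stays in sync with the Hub.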
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)