import os
import json
import datetime
import requests
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset, VerificationMode
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink

TOKEN = os.environ.get("TOKEN", None)

OWNER = "Blanca"
DATA_DATASET = f"{OWNER}/CQs-Gen_test"
INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
#CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results_public"
LEADERBOARD_PATH = "HiTZ/Critical_Questions_Leaderboard"
api = HfApi()

YEAR_VERSION = "2025"
ref_scores_len = {"test": 34}
#ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}

os.makedirs("scored", exist_ok=True)

# Should be False on Spaces and True outside
LOCAL_DEBUG = False  # not (os.environ.get("system") == "spaces")

# Display the results
test_results = load_dataset(
    RESULTS_DATASET,
    YEAR_VERSION,
    split="test",
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode=VerificationMode.NO_CHECKS,
    trust_remote_code=True,
)
eval_results = {"test": test_results}
#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)


def get_dataframe_from_results(eval_results, split):
    local_df = eval_results[split]
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt", "url"])
    local_df = local_df.rename_column("model", "Agent name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Score (%)")
    local_df = local_df.rename_column("date", "Submission date")
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Score (%)"], ascending=False)
    df["Score (%)"] = df["Score (%)"].multiply(100).round(2)
    return df


eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")

# Gold answers
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)
gold_results = {"test": {row["intervention_id"]: row for row in gold_dataset["test"]}}


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]


def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
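    """Validate, score, and record a new leaderboard submission.

    The uploaded .jsonl file is scored against the gold test split, the raw and
    scored files are archived in the internal submissions dataset, and the
    aggregate score is appended to the public results dataset.
    """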
    # Was the profile created less than 2 months ago?
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, "%Y-%m-%dT%H:%M:%S.%fZ") < datetime.timedelta(days=60):
        return format_error("This account is not authorized to submit on this leaderboard.")

    #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
    #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
    #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
    #    return format_error("You already submitted once today, please try again tomorrow.")

    val_or_test = "test"
    is_validation = False

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    # Check whether the model/organisation combination already exists and warn if it does
    if model.lower() in {m.lower() for m in eval_results[val_or_test]["model"]} and organisation.lower() in {o.lower() for o in eval_results[val_or_test]["organisation"]}:
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # SAVE UNSCORED SUBMISSION
    if LOCAL_DEBUG:
        print("mock uploaded submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # SAVE CONTACT
    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "username": profile.username,
        "mail": mail,
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    #contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
    #if LOCAL_DEBUG:
    #    print("mock uploaded contact info")
    #else:
    #    contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)
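    # Expected submission format (illustrative sketch; the field names are the
    # ones read by the scoring loop below): one JSON object per line in the
    # uploaded .jsonl file, with a "task_id" matching an "intervention_id" of
    # the gold test split and a "model_answer" string, e.g.
    #   {"task_id": "<some intervention_id>", "model_answer": "..."}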
    # SCORE SUBMISSION
    file_path = path_to_file.name
    scores = 0
    num_questions = 0
    task_ids = []
    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
        with open(file_path, "r") as f:
            for ix, line in enumerate(f):
                try:
                    task = json.loads(line)
                except Exception:
                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

                if "model_answer" not in task:
                    return format_error(f"Line {ix} is missing 'model_answer'.")
                answer = task["model_answer"]
                task_id = task["task_id"]
                if task_id not in gold_results[val_or_test]:
                    return format_error(f"{task_id} not found in gold set.")

                score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score,
                    }) + "\n"
                )

                task_ids.append(task_id)
                scores += score
                num_questions += 1

    # Check if there are any duplicates in the submission
    if len(task_ids) != len(set(task_ids)):
        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

    #if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
    #    return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")

    # SAVE SCORED SUBMISSION
    if LOCAL_DEBUG:
        print("mock uploaded scored submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # Save the scored file publicly as well for validation submissions
    if is_validation:
        api.upload_file(
            repo_id=SUBMISSION_DATASET_PUBLIC,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # SAVE TO LEADERBOARD DATA
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "system_prompt": system_prompt,
        "url": url,
        "organisation": organisation,
        # Average over the fixed size of the test set
        "score": scores / ref_scores_len[val_or_test],
        #"score_level1": scores[1]/num_questions[1],
        #"score_level2": scores[2]/num_questions[2],
        #"score_level3": scores[3]/num_questions[3],
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    #if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
    #    return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")

    # Catching spam submissions of 100%
    #if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
    #    return format_error(f"There was a problem with your submission. Please open a discussion.")
    # Testing for duplicates - to see if we want to add something like it, as it would allow people to try to see the content of other submissions
    #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
    #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
    #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
    #    return format_error(f"Your submission is an exact duplicate of an existing submission.")

    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    if LOCAL_DEBUG:
        print("mock uploaded results to lb")
    else:
        # Push only the updated split: eval_results is a plain dict of Datasets
        eval_results[val_or_test].push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")


def refresh():
    test_results = load_dataset(
        RESULTS_DATASET,
        YEAR_VERSION,
        split="test",
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS,
        trust_remote_code=True,
    )
    eval_dataframe_test = get_dataframe_from_results(eval_results={"test": test_results}, split="test")
    return eval_dataframe_test


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )  #.style(show_copy_button=True)

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"],
        )
    #with gr.Tab("Results: Validation"):
    #    leaderboard_table_val = gr.components.Dataframe(
    #        value=eval_dataframe_val, datatype=TYPES, interactive=False,
    #        column_widths=["20%"]
    #    )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])

    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Agent name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()

        with gr.Row():
            gr.LoginButton()
            submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # Pass add_new_eval directly (not wrapped in a lambda) so Gradio can see
        # the gr.OAuthProfile annotation and inject the logged-in profile; the
        # split comes from the "Split" radio, which only offers "test".
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )

# Restart the Space every hour so the leaderboard reloads the results dataset
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)