Blanca committed
Commit 6bf01ee · verified · 1 Parent(s): cc4f8bc

Create app.py

Files changed (1): app.py +329 -0
app.py ADDED
@@ -0,0 +1,329 @@
import os
import json
import datetime
import requests
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset, VerificationMode
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink

TOKEN = os.environ.get("TOKEN", None)

OWNER = "Blanca"
DATA_DATASET = "gaia-benchmark/GAIA"
INTERNAL_DATA_DATASET = "gaia-benchmark/GAIA"
SUBMISSION_DATASET = f"{DATA_DATASET}/test.jsonl"
SUBMISSION_DATASET_PUBLIC = "gaia-benchmark/submissions_public"
#CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = "gaia-benchmark/results_public"
LEADERBOARD_PATH = f"{OWNER}/Critical_Questions_leaderboard"
api = HfApi()

YEAR_VERSION = "2023"
ref_scores_len = {"test": 301}
ref_level_len = {"test": {1: 93, 2: 159, 3: 49}}

os.makedirs("scored", exist_ok=True)

# Should be False on Spaces and True outside
LOCAL_DEBUG = False  # not (os.environ.get("system") == "spaces")

# Display the results
eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
def get_dataframe_from_results(eval_results, split):
    local_df = eval_results[split]
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt", "url"])
    local_df = local_df.rename_column("model", "Agent name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Average score (%)")
    for i in [1, 2, 3]:
        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
    local_df = local_df.rename_column("date", "Submission date")
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Average score (%)"], ascending=False)

    numeric_cols = [c for c in local_df.column_names if "score" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    #df = df.style.format("{:.2%}", subset=numeric_cols)

    return df

eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")

# Gold answers
gold_results = {}
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN, trust_remote_code=True)
gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test"]}


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]

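# Note: the `profile` argument of add_new_eval is not passed via the click() inputs further down;
# Gradio's OAuth integration injects the logged-in user's profile automatically when a parameter
# is type-annotated as gr.OAuthProfile and a gr.LoginButton is rendered in the app.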
def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
    # Was the profile created less than 2 months ago?
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
        return format_error("This account is not authorized to submit on GAIA.")


    #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
    #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
    #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
    #    return format_error("You already submitted once today, please try again tomorrow.")


    is_validation = val_or_test == "validation"
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    # Check if the model/organisation combination already exists and print a warning message if it does
    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # SAVE UNSCORED SUBMISSION
    if LOCAL_DEBUG:
        print("mock uploaded submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # SAVE CONTACT
    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "username": profile.username,
        "mail": mail,
        "date": datetime.datetime.today().strftime('%Y-%m-%d')
    }
    #contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
    if LOCAL_DEBUG:
        print("mock uploaded contact info")
    #else:
    #    contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)

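    # Expected submission format (illustrative example, not taken from an official spec): one JSON
    # object per line, each with at least a "task_id" that exists in the gold split and a
    # "model_answer" string, e.g. {"task_id": "<gold task id>", "model_answer": "42"}.
    # Any extra keys are ignored by the scoring loop below.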
    # SCORE SUBMISSION
    file_path = path_to_file.name
    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
    task_ids = []
    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
        with open(file_path, 'r') as f:
            for ix, line in enumerate(f):
                try:
                    task = json.loads(line)
                except Exception:
                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

                if "model_answer" not in task:
                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
                answer = task["model_answer"]
                task_id = task["task_id"]
                try:
                    level = int(gold_results[val_or_test][task_id]["Level"])
                except KeyError:
                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")

                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])

                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score,
                        "level": level
                    }) + "\n"
                )
                task_ids.append(task_id)

                scores["all"] += score
                scores[level] += score
                num_questions["all"] += 1
                num_questions[level] += 1

    # Check if there's any duplicate in the submission
    if len(task_ids) != len(set(task_ids)):
        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

    if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")

    # SAVE SCORED SUBMISSION
    if LOCAL_DEBUG:
        print("mock uploaded scored submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # Save the scored file publicly for validation submissions
    if is_validation:
        api.upload_file(
            repo_id=SUBMISSION_DATASET_PUBLIC,
            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # SAVE TO LEADERBOARD DATA
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "system_prompt": system_prompt,
        "url": url,
        "organisation": organisation,
        "score": scores["all"] / ref_scores_len[val_or_test],
        "score_level1": scores[1] / num_questions[1],
        "score_level2": scores[2] / num_questions[2],
        "score_level3": scores[3] / num_questions[3],
        "date": datetime.datetime.today().strftime('%Y-%m-%d')
    }
    if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
        return format_error(f"Your submission has {num_questions['all']} questions for the test set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
    # Catching spam submissions of 100%
    if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
        return format_error("There was a problem with your submission. Please open a discussion.")

    # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
    #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
    #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
    #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
    #    return format_error(f"Your submission is an exact duplicate from an existing submission.")

    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    if LOCAL_DEBUG:
        print("mock uploaded results to lb")
    else:
        eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)


    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")


def refresh():
    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    # Return one dataframe per leaderboard table wired to the refresh button below
    return eval_dataframe_val, eval_dataframe_test

def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )  #.style(show_copy_button=True)

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )
    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Agent name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="URL to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()


        with gr.Row():
            gr.LoginButton()
            submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail
            ],
            submission_result,
        )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)