Blanca committed
Commit b2b0ba4 · verified · 1 Parent(s): 36f9b6b

Upload app.py

Files changed (1)
  1. app.py +55 -68
app.py CHANGED
@@ -19,18 +19,18 @@ from content import format_error, format_warning, format_log, TITLE, INTRODUCTIO
 TOKEN = os.environ.get("TOKEN", None)

 OWNER="Blanca"
-DATA_DATASET = f"gaia-benchmark/GAIA"
-INTERNAL_DATA_DATASET = f"gaia-benchmark/GAIA"
-SUBMISSION_DATASET = f"{DATA_DATASET}/test.jsonl"
-SUBMISSION_DATASET_PUBLIC = f"gaia-benchmark/submissions_public"
-#CONTACT_DATASET = f"{OWNER}/contact_info"
-RESULTS_DATASET = f"gaia-benchmark/results_public"
-LEADERBOARD_PATH = f"{OWNER}/Critical_Questions_leaderboard"
+DATA_DATASET = f"{OWNER}/critical_questions_generation"
+INTERNAL_DATA_DATASET = f"{OWNER}/critical_questions_generation"
+SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
+SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
+CONTACT_DATASET = f"{OWNER}/contact_info"
+RESULTS_DATASET = f"{OWNER}/results_public"
+LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
 api = HfApi()

-YEAR_VERSION = "2023"
-ref_scores_len = {"test": 301}
-ref_level_len = {"test": {1: 93, 2: 159, 3: 49}}
+YEAR_VERSION = "2025"
+ref_scores_len = {"test": 34}
+#ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}

 os.makedirs("scored", exist_ok=True)

@@ -39,33 +39,29 @@ LOCAL_DEBUG = False #not (os.environ.get("system") == "spaces")

 # Display the results
 eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
-#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
 def get_dataframe_from_results(eval_results, split):
     local_df = eval_results[split]
     local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
     local_df = local_df.remove_columns(["system_prompt", "url"])
     local_df = local_df.rename_column("model", "Agent name")
     local_df = local_df.rename_column("model_family", "Model family")
-    local_df = local_df.rename_column("score", "Average score (%)")
-    for i in [1, 2, 3]:
-        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
+    local_df = local_df.rename_column("score", "Score (%)")
     local_df = local_df.rename_column("date", "Submission date")
     df = pd.DataFrame(local_df)
-    df = df.sort_values(by=["Average score (%)"], ascending=False)
+    df = df.sort_values(by=["Score (%)"], ascending=False)

-    numeric_cols = [c for c in local_df.column_names if "score" in c]
-    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
-    #df = df.style.format("{:.2%}", subset=numeric_cols)
+    df["Score (%)"] = df["Score (%)"].multiply(100).round(2)

     return df

-eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+
 eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")

 # Gold answers
 gold_results = {}
-gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN, trust_remote_code=True)
-gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test"]}
+gold_dataset = load_dataset(INTERNAL_DATA_DATASET, token=TOKEN, trust_remote_code=True)
+gold_results = {"test": {row["intervention_id"]: row for row in gold_dataset["test"]}}


 def restart_space():
@@ -88,16 +84,16 @@ def add_new_eval(
     user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
     creation_date = json.loads(user_data.content)["createdAt"]
     if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
-        return format_error("This account is not authorized to submit on GAIA.")
+        return format_error("This account is not authorized to submit on this leaderboard.")


-    #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
-    #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
-    #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
-    #    return format_error("You already submitted once today, please try again tomorrow.")
+    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+    user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
+    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
+        return format_error("You already submitted once today, please try again tomorrow.")


-    #is_validation = val_or_test == "validation"
+    is_validation = val_or_test == "validation"
     # Very basic email parsing
     _, parsed_mail = parseaddr(mail)
     if not "@" in parsed_mail:
@@ -134,17 +130,18 @@ def add_new_eval(
         "mail": mail,
         "date": datetime.datetime.today().strftime('%Y-%m-%d')
     }
-    #contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
+    contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
     if LOCAL_DEBUG:
         print("mock uploaded contact info")
-    #else:
-    #    contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+    else:
+        contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)

     # SCORE SUBMISSION
     file_path = path_to_file.name
-    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
-    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
+    scores = 0
+    num_questions = 0
     task_ids = []
+
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for ix, line in enumerate(f):
@@ -152,39 +149,35 @@ def add_new_eval(
                     task = json.loads(line)
                 except Exception:
                     return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
-
                 if "model_answer" not in task:
-                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
+                    return format_error(f"Line {ix} missing 'model_answer'.")
                 answer = task["model_answer"]
                 task_id = task["task_id"]
-                try:
-                    level = int(gold_results[val_or_test][task_id]["Level"])
-                except KeyError:
-                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")

-                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
-
+                if task_id not in gold_results[val_or_test]:
+                    return format_error(f"{task_id} not found in gold set.")
+
+                score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
+
                 scored_file.write(
                     json.dumps({
                         "id": task_id,
                         "model_answer": answer,
-                        "score": score,
-                        "level": level
+                        "score": score
                     }) + "\n"
                 )
+
                 task_ids.append(task_id)
+                scores += score
+                num_questions += 1

-                scores["all"] += score
-                scores[level] += score
-                num_questions["all"] += 1
-                num_questions[level] += 1

     # Check if there's any duplicate in the submission
     if len(task_ids) != len(set(task_ids)):
         return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

-    if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
-        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
+    #if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
+    #    return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")

     # SAVE SCORED SUBMISSION
     if LOCAL_DEBUG:
@@ -215,14 +208,14 @@ def add_new_eval(
         "system_prompt": system_prompt,
         "url": url,
         "organisation": organisation,
-        "score": scores["all"]/ref_scores_len[val_or_test],
-        "score_level1": scores[1]/num_questions[1],
-        "score_level2": scores[2]/num_questions[2],
-        "score_level3": scores[3]/num_questions[3],
+        "score": scores / ref_scores_len,#[val_or_test],
+        #"score_level1": scores[1]/num_questions[1],
+        #"score_level2": scores[2]/num_questions[2],
+        #"score_level3": scores[3]/num_questions[3],
         "date": datetime.datetime.today().strftime('%Y-%m-%d')
     }
     if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
-        return format_error(f"Your submission has {len(scores['all'])} questions for the test set, but it should have {ref_scores_len['test']}. Please check your submission.")
+        return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
     # Catching spam submissions of 100%
     if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
         return format_error(f"There was a problem with your submission. Please open a discussion.")
@@ -246,10 +239,10 @@ def add_new_eval(

 def refresh():
     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS,trust_remote_code=True)
-    #eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
     eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
     return eval_dataframe_test

+
 def upload_file(files):
     file_paths = [file.name for file in files]
     return file_paths
@@ -273,21 +266,15 @@ with demo:
             value=eval_dataframe_test, datatype=TYPES, interactive=False,
             column_widths=["20%"]
         )
-    with gr.Tab("Results: Validation"):
-        leaderboard_table_val = gr.components.Dataframe(
-            value=eval_dataframe_val, datatype=TYPES, interactive=False,
-            column_widths=["20%"]
-        )
+    #with gr.Tab("Results: Validation"):
+    #    leaderboard_table_val = gr.components.Dataframe(
+    #        value=eval_dataframe_val, datatype=TYPES, interactive=False,
+    #        column_widths=["20%"]
+    #    )

     refresh_button = gr.Button("Refresh")
-    refresh_button.click(
-        refresh,
-        inputs=[],
-        outputs=[
-            leaderboard_table_val,
-            leaderboard_table_test,
-        ],
-    )
+    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])
+
     with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
             gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
@@ -326,4 +313,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
-demo.launch(debug=True)
+demo.launch(debug=True)