Blanca committed
Commit b81b835 · verified · 1 Parent(s): e5cf768

Update app.py

Files changed (1)
  1. app.py +26 -6
app.py CHANGED
@@ -70,7 +70,6 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, spli
  # Gold answers
  gold_results = {}
  gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)
- print(gold_dataset["test"])
  #gold_results = {"test": {row["cqs"]: row for row in gold_dataset["test"]}}


@@ -154,10 +153,31 @@ def add_new_eval(
  with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # I am not sure where this is being saved
  with open(file_path, 'r') as f:
  data = json.load(f)
- for ix, line in data.items():
- #return format_error(line['cqs'])
-
- # TODO: look at each question, compare it to the references, output a label between 0 and 1
+ for id_to_eval, line in data.items():
+ score = 0
+ for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
+ if id_to_eval == intervention_id:
+ references = gold_dataset['cqs']
+ reference_set = [row['cq'] for row in references]
+ for cq in line['cqs']:
+ # TODO: compare to each reference and get a value
+ cq_text = cq['cq']
+
+ #if args.metric == 'similarity':
+ sentence_embedding = model.encode(cq_text)
+ reference_embedding = model.encode(reference_set)
+ sims = model.similarity(sentence_embedding, reference_embedding).tolist()[0]
+
+ winner = np.argmax(sims)
+ # make sure the similarity of the winning reference sentence is at least 0.65
+ if sims[winner] > 0.65:
+ label = references[indx][winner]['label']
+ if label == 'Useful':
+ score += 1/3
+ #else:
+ # label = 'not_able_to_evaluate'
+
+ return format_error(score)



@@ -174,7 +194,7 @@ def add_new_eval(
  # return format_error(f"{task_id} not found in gold set.")

  #score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
- score = 1
+ #score = 1

  scored_file.write(
  json.dumps({
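
The loop added in the second hunk scores a submission by matching each generated critical question against the gold references that share its intervention_id: a question earns 1/3 of a point when its closest reference clears a 0.65 similarity threshold and carries the label 'Useful'. Below is a minimal standalone sketch of that per-intervention logic, assuming the embedding model is a SentenceTransformer; the checkpoint name, the score_intervention helper, and the toy data are illustrative and not taken from app.py.

# Standalone sketch of the similarity-based scoring added in this commit.
# Assumptions not taken from app.py: the embedding model is a SentenceTransformer
# (model.similarity requires sentence-transformers >= 3.0), gold references for one
# intervention are a list of {'cq': str, 'label': str}, predictions a list of {'cq': str}.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative checkpoint

def score_intervention(predicted_cqs, reference_cqs, threshold=0.65):
    """Award 1/3 of a point per predicted question whose closest reference
    clears the similarity threshold and is labelled 'Useful'."""
    reference_texts = [ref["cq"] for ref in reference_cqs]
    reference_embeddings = model.encode(reference_texts)
    score = 0.0
    for cq in predicted_cqs:
        cq_embedding = model.encode(cq["cq"])
        # model.similarity defaults to cosine similarity; pick the best-matching reference
        sims = model.similarity(cq_embedding, reference_embeddings).tolist()[0]
        winner = int(np.argmax(sims))
        if sims[winner] > threshold and reference_cqs[winner]["label"] == "Useful":
            score += 1 / 3
    return score

# Toy usage (labels other than 'Useful' are illustrative):
predictions = [{"cq": "What evidence supports this claim?"}]
references = [{"cq": "Is there evidence for the claim?", "label": "Useful"},
              {"cq": "Who is the speaker?", "label": "Unhelpful"}]
print(score_intervention(predictions, references))

In app.py itself the equivalent computation runs inline inside add_new_eval, with the references drawn from gold_dataset['cqs'] for the matching intervention_id.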