Update app.py
app.py CHANGED
@@ -70,7 +70,6 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, spli
 # Gold answers
 gold_results = {}
 gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)
-print(gold_dataset["test"])
 #gold_results = {"test": {row["cqs"]: row for row in gold_dataset["test"]}}
 
 
@@ -154,10 +153,31 @@ def add_new_eval(
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # I am not sure where this is being saved
         with open(file_path, 'r') as f:
             data = json.load(f)
-            for
-
-
-
+            for id_to_eval, line in data.items():
+                score = 0
+                for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
+                    if id_to_eval == intervention_id:
+                        references = gold_dataset['cqs']
+                        reference_set = [row['cq'] for row in references]
+                        for cq in line['cqs']:
+                            # TODO: compare to each reference and get a value
+                            cq_text = cq['cq']
+
+                            #if args.metric == 'similarity':
+                            sentence_embedding = model.encode(cq_text)
+                            reference_embedding = model.encode(reference_set)
+                            sims = model.similarity(sentence_embedding, reference_embedding).tolist()[0]
+
+                            winner = np.argmax(sims)
+                            # make sure the similarity of the winning reference sentence is at least 0.65
+                            if sims[winner] > 0.65:
+                                label = references[indx][winner]['label']
+                                if label == 'Useful':
+                                    score += 1/3
+                            #else:
+                            #    label = 'not_able_to_evaluate'
+
+            return format_error(score)
 
 
 
@@ -174,7 +194,7 @@ def add_new_eval(
            # return format_error(f"{task_id} not found in gold set.")
 
            #score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
-           score = 1
+           #score = 1
 
            scored_file.write(
                json.dumps({