Update app.py
app.py CHANGED
@@ -70,7 +70,6 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, spli
 # Gold answers
 gold_results = {}
 gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)
-print(gold_dataset["test"])
 #gold_results = {"test": {row["cqs"]: row for row in gold_dataset["test"]}}
 
 
@@ -154,10 +153,31 @@ def add_new_eval(
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # I am not sure where this is being saved
         with open(file_path, 'r') as f:
             data = json.load(f)
-            for
-
-
-
+            for id_to_eval, line in data.items():
+                score = 0
+                for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
+                    if id_to_eval == intervention_id:
+                        references = gold_dataset['cqs']
+                        reference_set = [row['cq'] for row in references]
+                        for cq in line['cqs']:
+                            # TODO: compare to each reference and get a value
+                            cq_text = cq['cq']
+
+                            #if args.metric == 'similarity':
+                            sentence_embedding = model.encode(cq_text)
+                            reference_embedding = model.encode(reference_set)
+                            sims = model.similarity(sentence_embedding, reference_embedding).tolist()[0]
+
+                            winner = np.argmax(sims)
+                            # make sure the similarity of the winning reference sentence is at least 0.65
+                            if sims[winner] > 0.65:
+                                label = references[indx][winner]['label']
+                                if label == 'Useful':
+                                    score += 1/3
+                            #else:
+                            #    label = 'not_able_to_evaluate'
+
+            return format_error(score)
 
 
 
@@ -174,7 +194,7 @@ def add_new_eval(
            # return format_error(f"{task_id} not found in gold set.")
 
            #score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
-           score = 1
+           #score = 1
 
            scored_file.write(
                json.dumps({