Update app.py
app.py CHANGED
@@ -17,8 +17,6 @@ from sentence_transformers import SentenceTransformer
 from scorer import question_scorer
 from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
 
-similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
-
 TOKEN = os.environ.get("TOKEN", None)
 
 OWNER="Blanca"
@@ -29,9 +27,16 @@ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
 #CONTACT_DATASET = f"{OWNER}/contact_info"
 RESULTS_DATASET = f"{OWNER}/results_public"
 LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
-METRIC = 'similarity'
+METRIC = 'similarity' # 'gemma'
 api = HfApi()
 
+if METRIC == 'similarity':
+    similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
+
+if METRIC == 'gemma':
+    model = AutoModelForCausalLM.from_pretrained('google/gemma-2-9b-it', device_map="auto", attn_implementation='eager')
+    tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-9b-it')
+
 YEAR_VERSION = "2025"
 ref_scores_len = {"test": 34}
 #ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}
@@ -82,6 +87,61 @@ def restart_space():
 
 TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]
 
+
+def run_model(model, tokenizer, prompt):
+    chat = [{"role": "user", "content": prompt}]
+    chat_formated = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    #print(chat_formated, flush=True)
+    inputs = tokenizer(chat_formated, return_tensors="pt")
+
+    inputs = inputs.to('cuda')
+
+    generated_ids = model.generate(**inputs, max_new_tokens=512)
+
+    #generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)] # this does not work for Gemma
+
+    out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+    #print(out, flush=True)
+
+    try:
+        output = out.split('model\n')[1].replace('\n', '')
+    except IndexError:
+        print('EVAL ERROR: ' + out, flush=True); output = out  # fall back to the raw decode when the 'model' turn marker is missing
+
+    #import pdb; pdb.set_trace()
+    output = output.strip()
+
+    return output
+
+def get_prompts(cq, references):
+    return {
+        'compare': f"""You will be given a set of reference questions, each with an identifying ID, and a newly generated question. Your task is to determine if any of the reference questions are asking for the same information as the new question.
+
+Here is the set of reference questions with their IDs:
+
+<reference_questions>
+{references}
+</reference_questions>
+
+Here is the newly generated question:
+
+<new_question>
+{cq}
+</new_question>
+
+Compare the new question to each of the reference questions. Look for questions that are asking for the same information, even if they are worded differently. Consider the core meaning and intent of each question, not just the exact wording.
+
+If you find a reference question that is asking for the same information as the new question, output only the ID of that reference question.
+
+If none of the reference questions are asking for the same information as the new question, output exactly 'Similar reference not found.' (without quotes).
+
+Your final output should consist of only one of the following:
+1. The ID of the most similar reference question
+2. The exact phrase 'Similar reference not found.'
+
+Do not include any explanation, reasoning, or additional text in your output."""}
+
 def add_new_eval(
     model: str,
     model_family: str,
@@ -165,7 +225,6 @@ def add_new_eval(
         reference_set = [row['cq'] for row in references[indx]]
        #print(reference_set, flush=True)
        for cq in line['cqs']:
-            # TODO: compare to each reference and get a value
            cq_text = cq['cq']
            #print(cq_text, flush=True)
 
@@ -180,10 +239,26 @@
            # make sure the similarity of the winning reference sentence is at least 0.65
            if sims[winner] > 0.65:
                label = references[indx][winner]['label']
-
-
-
-
+            else:
+                label = 'not_able_to_evaluate'
+
+            if METRIC == 'gemma':
+                prompts = get_prompts(cq_text, '\n'.join(reference_set))
+                winner = run_model(model, tokenizer, prompts['compare'])
+                try: # here make sure the output is the id of a reference cq
+                    if winner.strip() != 'Similar reference not found.':
+                        label = references[indx][int(winner)]['label']
+                    else:
+                        label = 'not_able_to_evaluate'
+                except IndexError:
+                    label = 'evaluation_issue'
+                except ValueError:
+                    label = 'evaluation_issue'
+
+            print(label, flush=True)
+            if label == 'Useful':
+                score += 1/3
+
        print(indx, score, flush=True)
        #return format_error(score)
 
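For context, the diff only shows the thresholding step of the 'similarity' branch; how sims and winner are computed happens in unchanged lines of app.py and is not visible here. Below is a minimal, self-contained sketch of how that comparison is presumably done with the same SentenceTransformer the app loads, assuming cosine similarity over sentence embeddings and an argmax over the references. The variable names sims and winner mirror the committed code; the example questions are made up for illustration.

# Hypothetical sketch, not part of the commit: encode the generated question and
# the references with the model used in app.py ("stsb-mpnet-base-v2"), take the
# best cosine similarity, and accept the match only above the 0.65 threshold
# that add_new_eval checks.
from sentence_transformers import SentenceTransformer, util

similarity_model = SentenceTransformer("stsb-mpnet-base-v2")

reference_set = [  # made-up reference critical questions
    "What evidence supports this claim?",
    "Could the cited expert be biased?",
]
cq_text = "Is there any proof backing this statement?"  # made-up generated question

ref_emb = similarity_model.encode(reference_set, convert_to_tensor=True)
cq_emb = similarity_model.encode(cq_text, convert_to_tensor=True)

sims = util.cos_sim(cq_emb, ref_emb)[0]  # one similarity score per reference
winner = int(sims.argmax())              # index of the closest reference

if sims[winner] > 0.65:
    print(f"matched reference {winner} (similarity {float(sims[winner]):.2f})")
else:
    print("not_able_to_evaluate")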