Spaces:

farmax
/

MD2JSON

Sleeping

App Files Community

farmax committed 7 days ago

Commit

6f3f570

verified ·

1 Parent(s): 629f6d9

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -98

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ from typing import Dict, Any, List, Tuple
 from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForCausalLM, pipeline
 # ================= CONFIGURAZIONE =================
 MODEL_DEBERTA = "osiria/deberta-italian-question-answering"
 MODEL_GEPPETTO = "LorenzoDeMattei/GePpeTto"
@@ -45,50 +44,25 @@ DOMANDE_IT: List[Tuple[str, str]] = [
     ("causale", "Qual è la causale della fattura? / Qual è la motivazione o descrizione del pagamento?")
 ]
-# ================= CACHE MODELLI =================
-LOADED: Dict[str, Any] = {}
-def get_deberta_pipeline():
-    if "deb" in LOADED: return LOADED["deb"]
-    tok = AutoTokenizer.from_pretrained(MODEL_DEBERTA)
-    mdl = AutoModelForQuestionAnswering.from_pretrained(MODEL_DEBERTA)
-    qa = pipeline("question-answering", model=mdl, tokenizer=tok, handle_impossible_answer=True, top_k=1, device=-1)
-    LOADED["deb"] = qa
-    return qa
-def get_geppetto_pipeline():
-    if "gepp" in LOADED: return LOADED["gepp"]
-    tok = AutoTokenizer.from_pretrained(MODEL_GEPPETTO)
-    mdl = AutoModelForCausalLM.from_pretrained(MODEL_GEPPETTO)
-    gen = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)
-    LOADED["gepp"] = gen
-    return gen
-# ================= UTILITY =================
 def preprocess_markdown(text: str) -> str:
     if not text: return ""
-    text = re.sub(r'\|[\s-]+\|', ' ', text)   # ripulisce separatori tabella
     text = text.replace('|', ' ')
     text = text.replace('**', '').replace('##', '')
-    # mapping semantico leggero
-    text = text.replace('P.IVA', 'partita IVA').replace('PIVA', 'partita IVA')
     text = re.sub(r'\s+', ' ', text).strip()
     return text
-def chunk_text(text: str, max_chars: int = 3000, overlap: int = 200) -> List[str]:
-    if len(text) <= max_chars: return [text]
-    chunks = []
-    i = 0
-    while i < len(text):
-        end = min(i + max_chars, len(text))
-        chunks.append(text[i:end])
-        i = end - overlap
-        if i < 0: i = 0
-    return chunks
-# ================= LOGICA PRINCIPALE =================
 def analyze_invoice(md_text: str, custom_question_it: str):
     logs: List[str] = []
     final_output: Dict[str, Any] = {}
@@ -97,86 +71,72 @@ def analyze_invoice(md_text: str, custom_question_it: str):
         return {"Error": "Testo troppo breve"}, "⚠️ Inserisci almeno 10 caratteri."
     clean_text = preprocess_markdown(md_text)
-    chunks = chunk_text(clean_text, max_chars=3000, overlap=200)
-    logs.append(f"📄 Testo originale: {len(md_text)} chars | Pulito: {len(clean_text)} chars | Chunks: {len(chunks)}")
-    qa_deb = get_deberta_pipeline()
-    gen_gepp = get_geppetto_pipeline()
     # 1) DeBERTa: QA estrattivo su tutte le domande + opzionale
     t_start_deb = time.time()
     deb_res: Dict[str, Any] = {}
     success_count = 0
-    def ask_all_chunks(question: str) -> Tuple[str, float]:
-        best_answer, best_score = "", 0.0
-        for c in chunks:
-            try:
-                r = qa_deb(question=question, context=c)
-                ans = r.get("answer", "").strip()
-                score = float(r.get("score", 0.0))
-                if score > best_score and ans:
-                    best_answer, best_score = ans, score
-            except Exception as e:
-                logs.append(f"❌ Errore QA chunk: {str(e)}")
-        return best_answer, best_score
     for key, question_text in DOMANDE_IT:
-        answer, score = ask_all_chunks(question_text)
-        status = "Successo" if score > 0.05 and answer else "Non Trovato"
-        if status == "Successo": success_count += 1
-        deb_res[key] = {
-            "domanda": question_text,
-            "risposta": answer,
-            "confidenza": round(score, 3),
-            "status": status
-        }
     custom_q = custom_question_it.strip()
     if custom_q:
-        answer, score = ask_all_chunks(custom_q)
-        status = "Successo" if score > 0.05 and answer else "Non Trovato"
-        if status == "Successo": success_count += 1
-        deb_res["domanda_opzionale"] = {
-            "domanda": custom_q,
-            "risposta": answer,
-            "confidenza": round(score, 3),
-            "status": status
-        }
     t_elapsed_deb = round(time.time() - t_start_deb, 2)
     final_output["DeBERTa (estrattivo)"] = deb_res
     logs.append(f"✅ DeBERTa completato in {t_elapsed_deb}s | Successi: {success_count}/{len(DOMANDE_IT) + (1 if custom_q else 0)}")
-    # 2) GePpeTto: generativo su tutte le domande in blocco
-    t_start_gepp = time.time()
-    try:
-        # Costruzione prompt conciso per ridurre rumore
-        prompt_lines = ["Rispondi in elenco puntato alle seguenti domande sulla fattura:"]
-        for _, q in DOMANDE_IT:
-            prompt_lines.append(f"- {q}")
-        if custom_q:
-            prompt_lines.append(f"- {custom_q}")
-        prompt_lines.append("\nContesto:")
-        prompt_lines.append(clean_text[:4000])  # taglio prudenziale su CPU
-        prompt_lines.append("\nRisposte (usa un punto per ogni domanda, senza inventare dati):")
-        prompt = "\n".join(prompt_lines)
-        gen = gen_gepp(prompt, max_new_tokens=256, do_sample=False)
-        generative_text = gen[0]["generated_text"].replace(prompt, "").strip()
-        final_output["GePpeTto (generativo)"] = {"risposte": generative_text}
-        t_elapsed_gepp = round(time.time() - t_start_gepp, 2)
-        logs.append(f"✅ GePpeTto completato in {t_elapsed_gepp}s")
-    except Exception as e:
-        final_output["GePpeTto (generativo)"] = {"errore": str(e)}
-        logs.append(f"❌ Errore GePpeTto: {e}")
     return final_output, "\n".join(logs)
 # ================== UI GRADIO ==================
 with gr.Blocks(theme=gr.themes.Base()) as demo:
-    gr.Markdown("# 🧾 Invoice QA: Domande standard + opzionale (DeBERTa estrattivo & GePpeTto generativo)")
-    gr.Markdown("Risposte estrattive strutturate per tutte le domande e un blocco generativo riassuntivo, con log e tempi.")
     with gr.Row():
         with gr.Column(scale=1):
@@ -194,11 +154,11 @@ with gr.Blocks(theme=gr.themes.Base()) as demo:
             btn = gr.Button("🔍 Analizza documento", variant="primary")
         with gr.Column(scale=1):
-            out_json = gr.JSON(label="Risultati estrattivi (DeBERTa) e generativi (GePpeTto)")
             with gr.Accordion("📝 Log di Sistema (Tempi e Debug)", open=False):
                 out_log = gr.Textbox(label="Process Log", lines=12)
     btn.click(fn=analyze_invoice, inputs=[md_input, custom_q_input], outputs=[out_json, out_log])
 if __name__ == "__main__":
-    demo.launch()

 from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForCausalLM, pipeline
 # ================= CONFIGURAZIONE =================
 MODEL_DEBERTA = "osiria/deberta-italian-question-answering"
 MODEL_GEPPETTO = "LorenzoDeMattei/GePpeTto"
     ("causale", "Qual è la causale della fattura? / Qual è la motivazione o descrizione del pagamento?")
 ]
+# ================= PIPELINES =================
+tok_deb = AutoTokenizer.from_pretrained(MODEL_DEBERTA)
+mdl_deb = AutoModelForQuestionAnswering.from_pretrained(MODEL_DEBERTA)
+qa_deb = pipeline("question-answering", model=mdl_deb, tokenizer=tok_deb, device=-1)
+tok_gepp = AutoTokenizer.from_pretrained(MODEL_GEPPETTO)
+mdl_gepp = AutoModelForCausalLM.from_pretrained(MODEL_GEPPETTO)
+qa_gepp = pipeline("text-generation", model=mdl_gepp, tokenizer=tok_gepp, device=-1)
+# ================= UTILITY =================
 def preprocess_markdown(text: str) -> str:
     if not text: return ""
+    text = re.sub(r'\|[\s-]+\|', ' ', text)
     text = text.replace('|', ' ')
     text = text.replace('**', '').replace('##', '')
     text = re.sub(r'\s+', ' ', text).strip()
     return text
+# ================= FUNZIONE PRINCIPALE =================
 def analyze_invoice(md_text: str, custom_question_it: str):
     logs: List[str] = []
     final_output: Dict[str, Any] = {}
         return {"Error": "Testo troppo breve"}, "⚠️ Inserisci almeno 10 caratteri."
     clean_text = preprocess_markdown(md_text)
     # 1) DeBERTa: QA estrattivo su tutte le domande + opzionale
     t_start_deb = time.time()
     deb_res: Dict[str, Any] = {}
     success_count = 0
     for key, question_text in DOMANDE_IT:
+        try:
+            res = qa_deb(question=question_text, context=clean_text)
+            answer = res["answer"].strip()
+            score = round(res.get("score", 0.0), 3)
+            status = "Successo" if score > 0.05 and answer else "Non Trovato"
+            if status == "Successo": success_count += 1
+            deb_res[key] = {
+                "domanda": question_text,
+                "risposta": answer,
+                "confidenza": score,
+                "status": status
+            }
+        except Exception as e:
+            deb_res[key] = {"status": f"Errore inferenza: {str(e)}"}
     custom_q = custom_question_it.strip()
     if custom_q:
+        try:
+            res = qa_deb(question=custom_q, context=clean_text)
+            answer = res["answer"].strip()
+            score = round(res.get("score", 0.0), 3)
+            status = "Successo" if score > 0.05 and answer else "Non Trovato"
+            if status == "Successo": success_count += 1
+            deb_res["domanda_opzionale"] = {
+                "domanda": custom_q,
+                "risposta": answer,
+                "confidenza": score,
+                "status": status
+            }
+        except Exception as e:
+            deb_res["domanda_opzionale"] = {"status": f"Errore inferenza: {str(e)}"}
     t_elapsed_deb = round(time.time() - t_start_deb, 2)
     final_output["DeBERTa (estrattivo)"] = deb_res
     logs.append(f"✅ DeBERTa completato in {t_elapsed_deb}s | Successi: {success_count}/{len(DOMANDE_IT) + (1 if custom_q else 0)}")
+    # 2) GePpeTto: SOLO domanda opzionale
+    if custom_q:
+        try:
+            t_start_gepp = time.time()
+            short_context = clean_text[:800]  # taglio prudenziale
+            prompt = f"Domanda: {custom_q}\nContesto: {short_context}\nRisposta:"
+            res_gepp = qa_gepp(prompt, max_new_tokens=64, do_sample=False)
+            generative_text = res_gepp[0]["generated_text"].replace(prompt, "").strip()
+            final_output["GePpeTto (generativo)"] = {"risposta_opzionale": generative_text}
+            t_elapsed_gepp = round(time.time() - t_start_gepp, 2)
+            logs.append(f"✅ GePpeTto completato in {t_elapsed_gepp}s")
+        except Exception as e:
+            final_output["GePpeTto (generativo)"] = {"errore": str(e)}
+            logs.append(f"❌ Errore GePpeTto: {e}")
+    else:
+        final_output["GePpeTto (generativo)"] = {"info": "Nessuna domanda opzionale fornita"}
     return final_output, "\n".join(logs)
 # ================== UI GRADIO ==================
 with gr.Blocks(theme=gr.themes.Base()) as demo:
+    gr.Markdown("# 🧾 Invoice QA: Domande standard + opzionale")
+    gr.Markdown("Risposte estrattive (DeBERTa) su tutte le domande e generative (GePpeTto) solo sulla domanda opzionale.")
     with gr.Row():
         with gr.Column(scale=1):
             btn = gr.Button("🔍 Analizza documento", variant="primary")
         with gr.Column(scale=1):
+            out_json = gr.JSON(label="Risultati (DeBERTa estrattivo + GePpeTto opzionale)")
             with gr.Accordion("📝 Log di Sistema (Tempi e Debug)", open=False):
                 out_log = gr.Textbox(label="Process Log", lines=12)
     btn.click(fn=analyze_invoice, inputs=[md_input, custom_q_input], outputs=[out_json, out_log])
 if __name__ == "__main__":
+    demo.launch()