# Source: Hugging Face Space "ibombonato/Semantic-search-br" (status at capture: Sleeping).
# File size: 6,307 bytes.
# Revisions listed in the page's blame gutter: a793867, 0af955d, d44b7ca,
# 3f0ee67, 82fdd67, caf6350.


import gradio as gr
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import numpy

# --- 1. SETUP MODELS AND DATABASE ---

# Checkpoint used to embed both the transcript chunks and the search queries.
# An alternative that was tried before: 'rufimelo/bert-large-portuguese-cased-sts'.
EMBEDDING_MODEL_NAME = 'jmbrito/ptbr-similarity-e5-small'

# Name of the Chroma collection that holds the indexed transcript chunks.
COLLECTION_NAME = "transcript_comparison_app"

print("Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# The collection is configured for cosine distance, so search code can turn a
# returned distance into a similarity with `1 - distance`.
client = chromadb.Client()
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)
print("ChromaDB collection ready.")


# --- 2. NEW: DEFINE AND PRE-COMPUTE GUIDELINE PROFILES ---

# Guideline name -> example phrases that positively illustrate the guideline.
# The keys are also used as the choices of the profile dropdown in the UI.
GUIDELINE_PROFILES = {
    "Agent Empathy": [
        "Sinto muito por esse transtorno.",
        "Eu entendo completamente sua frustração.",
        "Imagino como isso deve ser chato, vamos resolver.",
        "Lamento que você tenha passado por isso.",
        "Compreendo sua situação e peço desculpas pelo ocorrido.",
    ],
    "Problem Resolution Offer": [
        "Para resolver isso, posso te oferecer duas opções.",
        "Temos algumas alternativas para solucionar seu problema.",
        "A solução que posso propor é a seguinte.",
        "Vamos encontrar uma forma de resolver isso para você.",
    ],
    "Polite Closing": [
        "Obrigado por sua ligação, tenha um ótimo dia.",
        "Agradecemos seu contato.",
        "Se precisar de mais alguma coisa, é só ligar.",
        "Tenha uma excelente semana.",
    ],
}

# Done once at start-up: each guideline is represented by the element-wise
# mean of the embeddings of its example phrases.
print("Computing guideline profile embeddings...")
profile_embeddings = {
    name: numpy.mean(embedding_model.encode(examples), axis=0)
    for name, examples in GUIDELINE_PROFILES.items()
}
print("✅ Guideline profiles are ready.")


# --- 3. CORE FUNCTIONS ---

def index_transcript(transcript_text):
    """Split a transcript into sentence chunks and (re)index them.

    The text is split after sentence-ending punctuation (``.``, ``!``, ``?``)
    followed by whitespace; chunks of 5 characters or fewer (after stripping)
    are discarded. Any chunks already stored in the collection are replaced
    by the new ones.

    Args:
        transcript_text: Raw transcript pasted by the user.

    Returns:
        A ``(status_message, preview)`` tuple. ``preview`` is a DataFrame
        with a single ``"Indexed Chunks"`` column, or an empty DataFrame when
        nothing was indexed.
    """
    if not transcript_text or not transcript_text.strip():
        return "Please paste a transcript before indexing.", pd.DataFrame()

    stripped = (part.strip() for part in re.split(r'(?<=[.!?])\s+', transcript_text))
    chunks = [part for part in stripped if len(part) > 5]
    if not chunks:
        # Nothing survived the length filter. Bail out before touching the
        # collection: adding empty lists would fail, and the previous index
        # would already have been wiped.
        return (
            "No indexable sentences found (each chunk must be longer than 5 characters).",
            pd.DataFrame(),
        )

    # Embed first so a failure here leaves the existing index untouched.
    embeddings = embedding_model.encode(chunks).tolist()
    ids = [f"chunk_{i}" for i in range(len(chunks))]

    # Replace, rather than append to, whatever was indexed previously.
    if collection.count() > 0:
        collection.delete(ids=collection.get()['ids'])
    collection.add(embeddings=embeddings, documents=chunks, ids=ids)

    indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
    return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df

def _top_matches(query_embedding, n_results=3):
    """Return the chunks closest to ``query_embedding`` as a DataFrame.

    Args:
        query_embedding: Embedding vector as a plain list of floats.
        n_results: Maximum number of matches to return.

    Returns:
        A DataFrame with a ``"Similarity"`` column (strings formatted to two
        decimals) and a ``"Matching Chunk"`` column. It has no rows when
        nothing has been indexed yet.
    """
    indexed = collection.count()
    if indexed == 0:
        return pd.DataFrame({"Similarity": [], "Matching Chunk": []})

    # Never ask for more neighbours than exist; depending on the chromadb
    # version, over-asking either raises or only logs a warning.
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=min(n_results, indexed),
    )
    # A single query embedding was sent, so only the first result list is used.
    documents = results['documents'][0]
    distances = results['distances'][0]
    # The collection is created with cosine distance, hence 1 - distance.
    similarities = [f"{1 - dist:.2f}" for dist in distances]
    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})


def search_with_single_query(query):
    """Search the indexed chunks using the embedding of a free-text query.

    Args:
        query: Text typed by the user.

    Returns:
        An empty DataFrame when ``query`` is missing or blank; otherwise a
        DataFrame with ``"Similarity"`` and ``"Matching Chunk"`` columns
        holding up to three matches.
    """
    if not query or not query.strip():
        return pd.DataFrame()
    return _top_matches(embedding_model.encode(query).tolist())


def search_with_profile(guideline_name):
    """Search the indexed chunks using a pre-computed guideline profile.

    Args:
        guideline_name: Key into ``profile_embeddings``.

    Returns:
        An empty DataFrame when no guideline is selected; otherwise a
        DataFrame with ``"Similarity"`` and ``"Matching Chunk"`` columns
        holding up to three matches.

    Raises:
        KeyError: If ``guideline_name`` is not a known profile.
    """
    if not guideline_name:
        return pd.DataFrame()
    return _top_matches(profile_embeddings[guideline_name].tolist())


# --- 4. GRADIO INTERFACE FOR COMPARISON ---

# Example call-centre transcript (Portuguese) that pre-fills the transcript
# textbox. Some of its sentences repeat phrases from GUIDELINE_PROFILES
# verbatim, which makes the profile search easy to try out.
sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
Atendente: Puxa, que chato isso. Lamento que você tenha passado por isso. Pode me informar o número do pedido para eu localizar sua compra?
Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
Atendente: Entendo perfeitamente sua frustração. Para resolver isso, posso te oferecer duas opções.
Cliente: Prefiro receber um novo.
Atendente: Combinado. Obrigado por sua ligação, tenha um ótimo dia.
"""

# Page layout: an indexing row on top, a separator, then a row with the two
# search methods side by side. Components are declared in display order.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 Search Method Comparison")
    gr.Markdown("Index a transcript once, then search using both methods to compare the results.")

    with gr.Row():
        # Indexing column: transcript input, index button, status and preview
        with gr.Column(scale=1):
            transcript_input = gr.Textbox(lines=15, label="Paste Transcript Here", value=sample_transcript)
            index_button = gr.Button("Index Transcript", variant="primary")
            index_status = gr.Label()
            indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview")

    gr.HTML("<hr>") # Horizontal rule separating indexing from searching

    with gr.Row():
        # Column for the simple, single query search
        with gr.Column():
            gr.Markdown("### Method 1: Single Query Search")
            query_input = gr.Textbox(label="Enter a Simple Query", placeholder="Ex: o agente foi empático?")
            search_button_single = gr.Button("Search Single Query")
            results_output_single = gr.DataFrame(label="Single Query Results")

        # Column for the profile-based search; choices are the guideline names
        with gr.Column():
            gr.Markdown("### Method 2: Guideline Profile Search")
            profile_input = gr.Dropdown(choices=list(GUIDELINE_PROFILES.keys()), label="Select a Guideline Profile")
            search_button_profile = gr.Button("Search with Profile", variant="primary")
            results_output_profile = gr.DataFrame(label="Profile Search Results")

    # Wire up the components: each button calls one core function.
    # index_transcript returns (status, preview), matching its two outputs.
    index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
    search_button_single.click(fn=search_with_single_query, inputs=[query_input], outputs=[results_output_single])
    search_button_profile.click(fn=search_with_profile, inputs=[profile_input], outputs=[results_output_profile])

# Runs at module level, so the app starts whenever this file is executed.
demo.launch()