Spaces:

ibombonato
/

Semantic-search-br

Sleeping

App Files Files Community

Semantic-search-br / app.py

ibombonato

Upload folder using huggingface_hub

82fdd67 verified 5 months ago

raw

history blame contribute delete

6.31 kB


	import gradio as gr
	import chromadb
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	import re
	import numpy

	# --- 1. SETUP MODELS AND DATABASE ---

	# Sentence-embedding model shared by indexing, free-text queries and
	# guideline profiles.
	# Alternative checkpoint (currently disabled):
	#   'rufimelo/bert-large-portuguese-cased-sts'
	print("Loading embedding model...")
	embedding_model = SentenceTransformer("jmbrito/ptbr-similarity-e5-small")

	# One ChromaDB collection, configured to rank neighbours by cosine distance.
	client = chromadb.Client()
	collection = client.get_or_create_collection(
	    name="transcript_comparison_app",
	    metadata={"hnsw:space": "cosine"},
	)
	print("ChromaDB collection ready.")


	# --- 2. NEW: DEFINE AND PRE-COMPUTE GUIDELINE PROFILES ---

	# Guideline name -> example phrases that illustrate the guideline being met.
	# The names are offered as dropdown choices in the UI, and the phrases of each
	# guideline are embedded and averaged into a single profile vector.
	GUIDELINE_PROFILES = {
	    "Agent Empathy": [
	        "Sinto muito por esse transtorno.",
	        "Eu entendo completamente sua frustração.",
	        "Imagino como isso deve ser chato, vamos resolver.",
	        "Lamento que você tenha passado por isso.",
	        "Compreendo sua situação e peço desculpas pelo ocorrido.",
	    ],
	    "Problem Resolution Offer": [
	        "Para resolver isso, posso te oferecer duas opções.",
	        "Temos algumas alternativas para solucionar seu problema.",
	        "A solução que posso propor é a seguinte.",
	        "Vamos encontrar uma forma de resolver isso para você.",
	    ],
	    "Polite Closing": [
	        "Obrigado por sua ligação, tenha um ótimo dia.",
	        "Agradecemos seu contato.",
	        "Se precisar de mais alguma coisa, é só ligar.",
	        "Tenha uma excelente semana.",
	    ],
	}

	# Build one vector per guideline at start-up: encode every example phrase of
	# the guideline and take the element-wise mean across phrases (axis 0).
	print("Computing guideline profile embeddings...")
	profile_embeddings = {
	    profile_name: numpy.mean(embedding_model.encode(example_phrases), axis=0)
	    for profile_name, example_phrases in GUIDELINE_PROFILES.items()
	}
	print("✅ Guideline profiles are ready.")


	# --- 3. CORE FUNCTIONS ---

	def index_transcript(transcript_text):
	    """Split a transcript into sentence chunks and re-index them in ChromaDB.

	    The text is split on whitespace that follows ``.``, ``!`` or ``?``. Chunks
	    whose stripped length is 5 characters or fewer are dropped. Everything
	    currently stored in ``collection`` is deleted before the new chunks are
	    added, so the collection only holds the most recently indexed transcript.

	    Args:
	        transcript_text: Raw transcript text. ``None`` is treated as empty.

	    Returns:
	        A ``(status_message, preview)`` tuple. ``preview`` is a DataFrame with
	        a single "Indexed Chunks" column, or an empty DataFrame when nothing
	        was indexed.
	    """
	    if not transcript_text or not transcript_text.strip():
	        return "Please paste a transcript before indexing.", pd.DataFrame()

	    sentences = re.split(r'(?<=[.!?])\s+', transcript_text)
	    chunks = [sentence.strip() for sentence in sentences if len(sentence.strip()) > 5]
	    if not chunks:
	        # Every piece was filtered out. Stop here so the existing index is not
	        # wiped and no empty batch is sent to the encoder or the collection.
	        return "No chunks longer than 5 characters were found in the transcript.", pd.DataFrame()

	    ids = [f"chunk_{i}" for i in range(len(chunks))]

	    # Replace, rather than append to, whatever was indexed previously.
	    if collection.count() > 0:
	        collection.delete(ids=collection.get()['ids'])
	    collection.add(
	        embeddings=embedding_model.encode(chunks).tolist(),
	        documents=chunks,
	        ids=ids,
	    )

	    indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
	    return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df

	def search_with_single_query(query):
	    """Return the indexed chunks most similar to a free-text query.

	    The query is embedded with ``embedding_model`` and used to look up at most
	    three nearest chunks in ``collection``.

	    Args:
	        query: Free-text query. ``None`` or blank text yields no results.

	    Returns:
	        A DataFrame with a "Similarity" column (``1 - distance`` formatted to
	        two decimals, as strings) and a "Matching Chunk" column. An empty
	        DataFrame is returned for a blank query or when nothing is indexed.
	    """
	    if not query or not query.strip():
	        return pd.DataFrame()

	    indexed_count = collection.count()
	    if indexed_count == 0:
	        # Nothing has been indexed yet, so there is nothing to search.
	        return pd.DataFrame(columns=["Similarity", "Matching Chunk"])

	    query_embedding = embedding_model.encode(query).tolist()
	    results = collection.query(
	        query_embeddings=[query_embedding],
	        # Never request more neighbours than there are indexed chunks.
	        n_results=min(3, indexed_count),
	    )
	    # A single query embedding was sent, so only the first result list applies.
	    documents = results['documents'][0]
	    distances = results['distances'][0]
	    similarities = [f"{1 - dist:.2f}" for dist in distances]
	    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})

	def search_with_profile(guideline_name):
	    """Return the indexed chunks most similar to a guideline profile.

	    The pre-computed vector stored in ``profile_embeddings`` under
	    ``guideline_name`` is used to look up at most three nearest chunks in
	    ``collection``.

	    Args:
	        guideline_name: Key of ``profile_embeddings``. A falsy value (nothing
	            selected) yields no results.

	    Returns:
	        A DataFrame with a "Similarity" column (``1 - distance`` formatted to
	        two decimals, as strings) and a "Matching Chunk" column. An empty
	        DataFrame is returned when no guideline is given or nothing is indexed.

	    Raises:
	        KeyError: If ``guideline_name`` is not a key of ``profile_embeddings``.
	    """
	    if not guideline_name:
	        return pd.DataFrame()

	    profile_embedding = profile_embeddings[guideline_name].tolist()

	    indexed_count = collection.count()
	    if indexed_count == 0:
	        # Nothing has been indexed yet, so there is nothing to search.
	        return pd.DataFrame(columns=["Similarity", "Matching Chunk"])

	    results = collection.query(
	        query_embeddings=[profile_embedding],
	        # Never request more neighbours than there are indexed chunks.
	        n_results=min(3, indexed_count),
	    )
	    # A single query embedding was sent, so only the first result list applies.
	    documents = results['documents'][0]
	    distances = results['distances'][0]
	    similarities = [f"{1 - dist:.2f}" for dist in distances]
	    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})


	# --- 4. GRADIO INTERFACE FOR COMPARISON ---

	# Default text pre-filled in the transcript textbox: a short Portuguese
	# support call between an agent ("Atendente") and a customer ("Cliente").
	# Several agent sentences repeat phrases from GUIDELINE_PROFILES verbatim --
	# presumably so the demo produces clear matches; confirm before editing.
	sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
	Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
	Atendente: Puxa, que chato isso. Lamento que você tenha passado por isso. Pode me informar o número do pedido para eu localizar sua compra?
	Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
	Atendente: Entendo perfeitamente sua frustração. Para resolver isso, posso te oferecer duas opções.
	Cliente: Prefiro receber um novo.
	Atendente: Combinado. Obrigado por sua ligação, tenha um ótimo dia.
	"""

	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🔍 Search Method Comparison")
	    gr.Markdown("Index a transcript once, then search using both methods to compare the results.")

	    # Top row: paste a transcript, index it, and preview the stored chunks.
	    with gr.Row():
	        with gr.Column(scale=1):
	            transcript_input = gr.Textbox(
	                lines=15,
	                label="Paste Transcript Here",
	                value=sample_transcript,
	            )
	            index_button = gr.Button("Index Transcript", variant="primary")
	            index_status = gr.Label()
	            indexed_preview = gr.DataFrame(
	                headers=["Indexed Chunks"],
	                label="Indexed Data Preview",
	            )

	    # Horizontal rule separating indexing from searching.
	    gr.HTML("<hr>")

	    # Bottom row: the two search methods side by side.
	    with gr.Row():
	        # Method 1: embed a free-text query and search with it.
	        with gr.Column():
	            gr.Markdown("### Method 1: Single Query Search")
	            query_input = gr.Textbox(
	                label="Enter a Simple Query",
	                placeholder="Ex: o agente foi empático?",
	            )
	            search_button_single = gr.Button("Search Single Query")
	            results_output_single = gr.DataFrame(label="Single Query Results")

	        # Method 2: search with a pre-computed guideline profile vector.
	        with gr.Column():
	            gr.Markdown("### Method 2: Guideline Profile Search")
	            profile_input = gr.Dropdown(
	                choices=list(GUIDELINE_PROFILES),
	                label="Select a Guideline Profile",
	            )
	            search_button_profile = gr.Button("Search with Profile", variant="primary")
	            results_output_profile = gr.DataFrame(label="Profile Search Results")

	    # Connect each button to its handler and output components.
	    index_button.click(
	        fn=index_transcript,
	        inputs=[transcript_input],
	        outputs=[index_status, indexed_preview],
	    )
	    search_button_single.click(
	        fn=search_with_single_query,
	        inputs=[query_input],
	        outputs=[results_output_single],
	    )
	    search_button_profile.click(
	        fn=search_with_profile,
	        inputs=[profile_input],
	        outputs=[results_output_profile],
	    )

	demo.launch()