Spaces:

Karthix1
/

data-analyst-agent

Sleeping

App Files Files Community

data-analyst-agent / tools.py

KarthikMuraliM

Fix: Correctly configure sandbox scope for code execution

aa9714c 4 months ago

raw

history blame contribute delete

4.85 kB

	# tools.py (Index-based Version)
	import pandas as pd
	from playwright.async_api import async_playwright
	from bs4 import BeautifulSoup
	import json
	import openai
	import pandas as pd
	import re
	import io
	import sys
	from contextlib import redirect_stdout


	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.linear_model import LinearRegression
	import numpy as np
	import base64




	client = None
	def set_openai_client(c):
	global client
	client = c

	async def get_dynamic_html(url: str) -> str:
	# This function remains the same
	async with async_playwright() as p:
	browser = await p.chromium.launch()
	page = await browser.new_page()
	try:
	await page.goto(url, timeout=20000, wait_until='networkidle')
	html_content = await page.content()
	except Exception as e:
	await browser.close()
	return f"Error fetching page with Playwright: {e}"
	await browser.close()
	return html_content

	def choose_best_table_from_html(html_content: str, task_description: str) -> str:
	"""
	Uses an LLM to identify the best table by its INDEX.
	Returns a JSON object with the table's index, e.g., {"index": 0}.
	"""
	soup = BeautifulSoup(html_content, 'lxml')
	tables = soup.find_all('table')

	if not tables:
	return '{"error": "No tables found on the page."}'

	table_summaries = []
	for i, table in enumerate(tables):
	rows = table.find_all('tr')
	sample_text = ""
	for row in rows[:3]:
	cells = row.find_all(['td', 'th'])
	sample_text += " \| ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"

	table_summaries.append({
	"index": i, # Use the index as the identifier
	"sample_data": sample_text.strip()
	})

	system_prompt = """
	You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.
	Based on the user's task, your job is to identify the single best table.
	Respond with a single JSON object containing the index of the best table, like this: {"index": 1}
	"""
	user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"

	try:
	completion = client.chat.completions.create(
	model="gpt-5-nano",
	response_format={"type": "json_object"},
	messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
	)
	return completion.choices[0].message.content
	except Exception as e:
	return f'{{"error": "LLM error in choosing table: {str(e)}"}}'

	def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame \| str):
	"""Extracts a specific table from HTML using its index into a DataFrame."""
	soup = BeautifulSoup(html_content, 'lxml')
	tables = soup.find_all('table')

	if not 0 <= table_index < len(tables):
	return f"Error: Invalid table index {table_index}. Only {len(tables)} tables were found."

	selected_table = tables[table_index]

	try:
	df_list = pd.read_html(io.StringIO(str(selected_table)))
	if not df_list:
	return "Error: Pandas could not parse the selected table."
	return df_list[0]
	except Exception as e:
	return f"Error converting table to DataFrame: {e}"


	def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
	"""
	Executes Python code with a DataFrame and common libraries available.
	Captures and returns any output printed to stdout.
	"""
	output_stream = io.StringIO()

	# --- THIS IS THE CORRECTED SANDBOX SETUP ---
	# Create a single dictionary to serve as the global and local scope.
	# This ensures that all libraries are accessible everywhere inside the exec'd code.
	execution_scope = {
	'df': df,
	'pd': pd,
	're': re,
	'plt': plt,
	'sns': sns,
	'np': np,
	'LinearRegression': LinearRegression,
	'io': io,
	'base64': base64,
	'__builtins__': __builtins__ # Ensure basic built-ins are available
	}

	try:
	with redirect_stdout(output_stream):
	# Pass the scope dictionary as the 'globals' argument.
	# This makes 'pd', 're', etc. globally available to the script.
	exec(python_code, execution_scope)

	plt.close('all')

	result = output_stream.getvalue()
	if not result:
	return "Code executed successfully with no printed output."
	return result

	except Exception as e:
	plt.close('all')
	return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"