data-analyst-agent / tools.py
KarthikMuraliM's picture
Fix: Correctly configure sandbox scope for code execution
aa9714c
# tools.py (Index-based Version)
import pandas as pd
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import json
import openai
import pandas as pd
import re
import io
import sys
from contextlib import redirect_stdout
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
import base64
client = None
def set_openai_client(c):
global client
client = c
async def get_dynamic_html(url: str) -> str:
# This function remains the same
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page()
try:
await page.goto(url, timeout=20000, wait_until='networkidle')
html_content = await page.content()
except Exception as e:
await browser.close()
return f"Error fetching page with Playwright: {e}"
await browser.close()
return html_content
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
"""
Uses an LLM to identify the best table by its INDEX.
Returns a JSON object with the table's index, e.g., {"index": 0}.
"""
soup = BeautifulSoup(html_content, 'lxml')
tables = soup.find_all('table')
if not tables:
return '{"error": "No tables found on the page."}'
table_summaries = []
for i, table in enumerate(tables):
rows = table.find_all('tr')
sample_text = ""
for row in rows[:3]:
cells = row.find_all(['td', 'th'])
sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
table_summaries.append({
"index": i, # Use the index as the identifier
"sample_data": sample_text.strip()
})
system_prompt = """
You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.
Based on the user's task, your job is to identify the single best table.
Respond with a single JSON object containing the index of the best table, like this: {"index": 1}
"""
user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
try:
completion = client.chat.completions.create(
model="gpt-5-nano",
response_format={"type": "json_object"},
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
)
return completion.choices[0].message.content
except Exception as e:
return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame | str):
"""Extracts a specific table from HTML using its index into a DataFrame."""
soup = BeautifulSoup(html_content, 'lxml')
tables = soup.find_all('table')
if not 0 <= table_index < len(tables):
return f"Error: Invalid table index {table_index}. Only {len(tables)} tables were found."
selected_table = tables[table_index]
try:
df_list = pd.read_html(io.StringIO(str(selected_table)))
if not df_list:
return "Error: Pandas could not parse the selected table."
return df_list[0]
except Exception as e:
return f"Error converting table to DataFrame: {e}"
def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
"""
Executes Python code with a DataFrame and common libraries available.
Captures and returns any output printed to stdout.
"""
output_stream = io.StringIO()
# --- THIS IS THE CORRECTED SANDBOX SETUP ---
# Create a single dictionary to serve as the global and local scope.
# This ensures that all libraries are accessible everywhere inside the exec'd code.
execution_scope = {
'df': df,
'pd': pd,
're': re,
'plt': plt,
'sns': sns,
'np': np,
'LinearRegression': LinearRegression,
'io': io,
'base64': base64,
'__builtins__': __builtins__ # Ensure basic built-ins are available
}
try:
with redirect_stdout(output_stream):
# Pass the scope dictionary as the 'globals' argument.
# This makes 'pd', 're', etc. globally available to the script.
exec(python_code, execution_scope)
plt.close('all')
result = output_stream.getvalue()
if not result:
return "Code executed successfully with no printed output."
return result
except Exception as e:
plt.close('all')
return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"