Spaces:
Sleeping
Sleeping
File size: 4,851 Bytes
8de87af 74392da a523805 cfd667b 4fda5be 8bcc812 98933b3 cfd667b a523805 8de87af a523805 cfd667b a523805 cfd667b a523805 cfd667b a523805 cfd667b 74392da cfd667b 74392da 8de87af 74392da cfd667b 74392da cfd667b 8de87af cfd667b 74392da cfd667b 8de87af cfd667b 8de87af cfd667b 74392da cfd667b 8bcc812 cfd667b 8de87af cfd667b 74392da cfd667b 8de87af cfd667b 8de87af 04090ba 8de87af 04090ba 8de87af cfd667b e9b9efe cfd667b 8bcc812 98933b3 8bcc812 aa9714c 4fda5be 98933b3 aa9714c 4fda5be 8bcc812 aa9714c 8bcc812 98933b3 8bcc812 aa9714c 98933b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# tools.py (Index-based Version)
import pandas as pd
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import json
import openai
import pandas as pd
import re
import io
import sys
from contextlib import redirect_stdout
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
import base64
# Module-level OpenAI client; stays None until set_openai_client() is called.
client = None


def set_openai_client(c):
    """Store ``c`` as the module-level ``client`` used for LLM requests."""
    global client
    client = c
async def get_dynamic_html(url: str) -> str:
    """Load ``url`` in a Playwright-driven Chromium and return the page's HTML.

    Navigation uses ``timeout=20000`` and waits for the ``networkidle`` state.
    If navigating or reading the page content raises, the exception is not
    propagated; a string starting with "Error fetching page with Playwright:"
    is returned instead. The browser is closed on both paths.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        page = await browser.new_page()
        try:
            await page.goto(url, timeout=20000, wait_until='networkidle')
            return await page.content()
        except Exception as exc:
            return f"Error fetching page with Playwright: {exc}"
        finally:
            # Runs after the return value is computed, on success and on failure.
            await browser.close()
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
    """
    Uses an LLM to identify the best table by its INDEX.

    Every <table> in ``html_content`` is summarised by the text of the first
    four cells of its first three rows. The summaries and ``task_description``
    are sent to the module-level ``client``.

    Returns the model's reply, which is requested to be a JSON object with the
    table's index, e.g., {"index": 0}. If the page has no tables, the client
    has not been set, or the LLM call raises, a JSON object of the form
    {"error": "<message>"} is returned instead.
    """
    soup = BeautifulSoup(html_content, 'lxml')
    tables = soup.find_all('table')
    if not tables:
        return '{"error": "No tables found on the page."}'

    table_summaries = []
    for i, table in enumerate(tables):
        sample_lines = [
            " | ".join(
                cell.get_text(strip=True)
                for cell in row.find_all(['td', 'th'])[:4]
            )
            for row in table.find_all('tr')[:3]
        ]
        table_summaries.append({
            "index": i,  # Use the index as the identifier
            "sample_data": "\n".join(sample_lines).strip(),
        })

    system_prompt = (
        "\n"
        "You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.\n"
        "Based on the user's task, your job is to identify the single best table.\n"
        'Respond with a single JSON object containing the index of the best table, like this: {"index": 1}\n'
    )
    user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"

    if client is None:
        # Report the real cause instead of an opaque AttributeError on None.
        return json.dumps({"error": "OpenAI client is not configured; call set_openai_client() first."})

    try:
        completion = client.chat.completions.create(
            model="gpt-5-nano",
            response_format={"type": "json_object"},
            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
        )
        return completion.choices[0].message.content
    except Exception as e:
        # json.dumps escapes quotes, backslashes and newlines in the exception
        # text, so the returned payload is always parseable JSON.
        return json.dumps({"error": f"LLM error in choosing table: {e}"})
def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame | str):
    """Return the ``table_index``-th <table> of ``html_content`` as a DataFrame.

    Tables are counted in document order starting at 0. Instead of raising,
    the function returns a string starting with "Error" when the index is out
    of range, when pandas yields no frames, or when parsing raises.
    """
    all_tables = BeautifulSoup(html_content, 'lxml').find_all('table')
    table_count = len(all_tables)
    if not 0 <= table_index < table_count:
        return f"Error: Invalid table index {table_index}. Only {table_count} tables were found."

    try:
        # read_html is given only the chosen table's markup, so the first
        # parsed frame corresponds to that table.
        table_markup = io.StringIO(str(all_tables[table_index]))
        parsed_frames = pd.read_html(table_markup)
    except Exception as exc:
        return f"Error converting table to DataFrame: {exc}"

    if parsed_frames:
        return parsed_frames[0]
    return "Error: Pandas could not parse the selected table."
def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
    """
    Executes Python code with a DataFrame and common libraries available.
    Captures and returns any output printed to stdout.

    The code sees ``df``, ``pd``, ``re``, ``plt``, ``sns``, ``np``,
    ``LinearRegression``, ``io`` and ``base64`` as globals.

    Returns the captured stdout, or a fixed success message when nothing was
    printed. If the code raises (including ``SystemExit`` from ``exit()`` or
    ``sys.exit()``), returns a string starting with "Error executing code:"
    followed by the exception and the code that failed. All matplotlib
    figures are closed before returning, on success and on failure.

    SECURITY NOTE: this is NOT a sandbox. ``exec`` runs ``python_code`` inside
    this process with the full set of builtins, so the code can import
    modules, touch the filesystem and mutate ``df``. Only pass code from a
    trusted source, or run this helper in an isolated process/container.
    """
    output_stream = io.StringIO()

    # A single dictionary serves as both global and local scope, so the
    # libraries are also reachable from functions defined by the exec'd code.
    execution_scope = {
        'df': df,
        'pd': pd,
        're': re,
        'plt': plt,
        'sns': sns,
        'np': np,
        'LinearRegression': LinearRegression,
        'io': io,
        'base64': base64,
        '__builtins__': __builtins__,  # Ensure basic built-ins are available
    }

    try:
        with redirect_stdout(output_stream):
            exec(python_code, execution_scope)
    except (Exception, SystemExit) as e:
        # SystemExit is not an Exception subclass; without catching it, an
        # exit() call in the supplied code would propagate out of this helper.
        return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"
    finally:
        # Release any figures the code created, whether it succeeded or not.
        plt.close('all')

    return output_stream.getvalue() or "Code executed successfully with no printed output."
|