Spaces:
Sleeping
Sleeping
| # tools.py (Index-based Version) | |
| import pandas as pd | |
| from playwright.async_api import async_playwright | |
| from bs4 import BeautifulSoup | |
| import json | |
| import openai | |
| import pandas as pd | |
| import re | |
| import io | |
| import sys | |
| from contextlib import redirect_stdout | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.linear_model import LinearRegression | |
| import numpy as np | |
| import base64 | |
# Module-level OpenAI client handle. It stays None until set_openai_client()
# is called; choose_best_table_from_html() reads it when querying the LLM.
client = None


def set_openai_client(c) -> None:
    """Store ``c`` as the module-level ``client`` used for LLM calls."""
    global client
    client = c
async def get_dynamic_html(url: str) -> str:
    """
    Load ``url`` in a Playwright-driven Chromium browser and return the page HTML.

    Args:
        url: Address of the page to open.

    Returns:
        The rendered page content on success, or a string starting with
        "Error fetching page with Playwright:" if opening the page,
        navigating, or reading the content fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            # new_page() is inside the try so a failure here is reported the
            # same way as a navigation failure and still closes the browser.
            page = await browser.new_page()
            # Wait until the network is idle so script-rendered content is
            # present in the returned HTML; give up after the 20000 timeout
            # (milliseconds, per the Playwright API).
            await page.goto(url, timeout=20000, wait_until='networkidle')
            return await page.content()
        except Exception as e:
            return f"Error fetching page with Playwright: {e}"
        finally:
            # Single cleanup point for both the success and the error path.
            await browser.close()
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
    """
    Uses an LLM to identify the best table by its INDEX.

    Each ``<table>`` in ``html_content`` is summarised by the text of the
    first four cells of its first three rows; the summaries and
    ``task_description`` are sent to the module-level ``client``.

    Args:
        html_content: HTML document to search for tables.
        task_description: The user's task, used by the LLM to pick a table.

    Returns:
        A JSON string. On success this is whatever the LLM returned, expected
        to look like ``{"index": 0}``. On failure it is ``{"error": "..."}``;
        the error payload is always valid JSON.
    """
    soup = BeautifulSoup(html_content, 'lxml')
    tables = soup.find_all('table')
    if not tables:
        return json.dumps({"error": "No tables found on the page."})

    table_summaries = []
    for i, table in enumerate(tables):
        # Only a small sample is sent: 3 rows x 4 cells per table.
        sample_lines = [
            " | ".join(
                cell.get_text(strip=True)
                for cell in row.find_all(['td', 'th'])[:4]
            )
            for row in table.find_all('tr')[:3]
        ]
        table_summaries.append({
            "index": i,  # Use the index as the identifier
            "sample_data": "\n".join(sample_lines).strip(),
        })

    system_prompt = """
    You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.
    Based on the user's task, your job is to identify the single best table.
    Respond with a single JSON object containing the index of the best table, like this: {"index": 1}
    """
    user_prompt = (
        f"User's task: '{task_description}'\n\n"
        f"Here are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
    )

    try:
        completion = client.chat.completions.create(
            model="gpt-5-nano",
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # json.dumps escapes quotes, backslashes and newlines in the exception
        # text; interpolating it into a hand-written JSON literal would yield
        # invalid JSON for such messages.
        return json.dumps({"error": f"LLM error in choosing table: {e}"})

    if content is None:
        # Keep the declared ``str`` return type even if the reply has no text.
        return json.dumps({"error": "LLM error in choosing table: empty response."})
    return content
def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame | str):
    """
    Convert the ``table_index``-th ``<table>`` of ``html_content`` to a DataFrame.

    Args:
        html_content: HTML document containing the tables.
        table_index: Zero-based position of the wanted table among all
            ``<table>`` elements found in the document.

    Returns:
        The parsed DataFrame, or a string starting with "Error" when the
        index is out of range or pandas cannot parse the table.
    """
    found_tables = BeautifulSoup(html_content, 'lxml').find_all('table')
    table_count = len(found_tables)

    if not (0 <= table_index < table_count):
        return f"Error: Invalid table index {table_index}. Only {table_count} tables were found."

    chosen = found_tables[table_index]
    try:
        # read_html is given a file-like object wrapping only the chosen
        # table's markup, so it parses that table alone.
        markup = io.StringIO(str(chosen))
        frames = pd.read_html(markup)
        if frames:
            return frames[0]
        return "Error: Pandas could not parse the selected table."
    except Exception as e:
        return f"Error converting table to DataFrame: {e}"
def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
    """
    Executes Python code with a DataFrame and common libraries available.

    The code runs via ``exec`` with ``df``, ``pd``, ``re``, ``plt``, ``sns``,
    ``np``, ``LinearRegression``, ``io`` and ``base64`` predefined, and
    anything it prints to stdout is captured.

    SECURITY: this is NOT a sandbox. The executed code gets the full set of
    builtins and runs inside the current process, so ``python_code`` must
    come from a trusted source or the call must be isolated externally.

    Args:
        df: DataFrame exposed to the code as ``df`` (the same object, not a
            copy, so the code can mutate it).
        python_code: Source text to execute.

    Returns:
        The captured stdout; a fixed success message if nothing was printed;
        or a string starting with "Error executing code:" that includes the
        exception and the offending code.
    """
    output_stream = io.StringIO()

    # A single dict is used as the globals of the executed code so these names
    # are also visible inside functions that the code itself defines.
    execution_scope = {
        'df': df,
        'pd': pd,
        're': re,
        'plt': plt,
        'sns': sns,
        'np': np,
        'LinearRegression': LinearRegression,
        'io': io,
        'base64': base64,
        '__builtins__': __builtins__,  # Ensure basic built-ins are available
    }

    try:
        with redirect_stdout(output_stream):
            exec(python_code, execution_scope)
    except (Exception, SystemExit) as e:
        # SystemExit is caught as well: code calling exit()/sys.exit() would
        # otherwise propagate past ``except Exception`` and could terminate
        # the host process.
        return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"
    finally:
        # Close any figures the code opened, whether it succeeded or failed.
        plt.close('all')

    result = output_stream.getvalue()
    if not result:
        return "Code executed successfully with no printed output."
    return result