File size: 4,851 Bytes
8de87af
74392da
a523805
cfd667b
 
 
4fda5be
 
8bcc812
 
 
98933b3
 
 
 
 
 
 
 
 
 
 
cfd667b
 
 
 
 
a523805
8de87af
a523805
 
 
cfd667b
a523805
 
cfd667b
a523805
cfd667b
a523805
cfd667b
74392da
cfd667b
74392da
8de87af
 
74392da
cfd667b
 
 
 
 
74392da
cfd667b
 
 
 
8de87af
cfd667b
 
74392da
cfd667b
8de87af
cfd667b
 
 
 
8de87af
 
 
cfd667b
 
74392da
cfd667b
 
8bcc812
cfd667b
8de87af
cfd667b
 
74392da
cfd667b
 
8de87af
 
cfd667b
8de87af
04090ba
8de87af
 
04090ba
8de87af
 
cfd667b
e9b9efe
cfd667b
 
 
 
8bcc812
 
 
 
 
98933b3
8bcc812
 
 
 
aa9714c
 
 
 
4fda5be
 
98933b3
aa9714c
 
 
 
 
 
 
4fda5be
8bcc812
 
 
aa9714c
 
 
8bcc812
98933b3
 
8bcc812
 
 
 
 
 
aa9714c
98933b3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# tools.py (Index-based Version)
import pandas as pd
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import json
import openai
import pandas as pd
import re
import io
import sys
from contextlib import redirect_stdout


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
import base64




# Module-level OpenAI client handle; remains None until set_openai_client() runs.
client = None


def set_openai_client(c):
    """Store the OpenAI client object in the module-level ``client`` variable.

    Args:
        c: The client instance to keep. It is stored as-is; no validation
            is performed here.
    """
    global client
    client = c

async def get_dynamic_html(url: str) -> str:
    """Load *url* in a Playwright-driven Chromium browser and return its HTML.

    Navigation waits for the ``networkidle`` state with a 20 second timeout,
    then the page's current content is read back.

    Args:
        url: Address of the page to load.

    Returns:
        The page HTML on success. If opening the page, navigating, or reading
        the content raises, a string starting with
        ``"Error fetching page with Playwright:"`` is returned instead.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            # Page creation is inside the try so its failures are reported the
            # same way as navigation failures and still reach the cleanup below.
            page = await browser.new_page()
            await page.goto(url, timeout=20000, wait_until='networkidle')
            return await page.content()
        except Exception as e:
            return f"Error fetching page with Playwright: {e}"
        finally:
            # Single cleanup point for every exit path out of the try block.
            await browser.close()

def choose_best_table_from_html(html_content: str, task_description: str) -> str:
    """
    Uses an LLM to identify the best table by its INDEX.

    Every ``<table>`` in the HTML is summarised by a small sample (first three
    rows, first four cells of each row) and the list of summaries is sent to
    the model together with the task description.

    Args:
        html_content: HTML markup to search for tables.
        task_description: The user's task, quoted verbatim in the prompt.

    Returns:
        A JSON string. On success this is the model's reply, which is
        requested in the form {"index": 0}. If the page has no tables or the
        LLM call raises, it is a JSON object with a single "error" key.
    """
    soup = BeautifulSoup(html_content, 'lxml')
    tables = soup.find_all('table')

    if not tables:
        return '{"error": "No tables found on the page."}'

    table_summaries = []
    for i, table in enumerate(tables):
        # Keep the sample small so the prompt stays short even on large pages.
        sample_lines = [
            " | ".join(
                cell.get_text(strip=True)
                for cell in row.find_all(['td', 'th'])[:4]
            )
            for row in table.find_all('tr')[:3]
        ]
        table_summaries.append({
            "index": i,  # Use the index as the identifier
            "sample_data": "\n".join(sample_lines).strip()
        })

    system_prompt = """
    You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.
    Based on the user's task, your job is to identify the single best table.
    Respond with a single JSON object containing the index of the best table, like this: {"index": 1}
    """
    user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"

    try:
        completion = client.chat.completions.create(
            model="gpt-5-nano",
            response_format={"type": "json_object"},
            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Serialise with json.dumps: exception text often contains quotes,
        # backslashes or newlines that would break hand-built JSON.
        return json.dumps(
            {"error": f"LLM error in choosing table: {e}"},
            ensure_ascii=False,
        )

def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame | str):
    """Convert the ``table_index``-th ``<table>`` of an HTML document to a DataFrame.

    Args:
        html_content: HTML markup to search for ``<table>`` elements.
        table_index: Zero-based position of the wanted table among those found.

    Returns:
        The first DataFrame pandas produces for the selected table, or a
        string beginning with ``"Error"`` when the index is out of range or
        the table cannot be converted.
    """
    all_tables = BeautifulSoup(html_content, 'lxml').find_all('table')
    table_count = len(all_tables)

    index_in_range = 0 <= table_index < table_count
    if not index_in_range:
        return f"Error: Invalid table index {table_index}. Only {table_count} tables were found."

    chosen = all_tables[table_index]

    try:
        # pandas parses markup from a file-like object, so wrap the table's HTML.
        frames = pd.read_html(io.StringIO(str(chosen)))
    except Exception as e:
        return f"Error converting table to DataFrame: {e}"

    if frames:
        return frames[0]
    return "Error: Pandas could not parse the selected table."
    

def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
    """
    Executes Python code with a DataFrame and common libraries available.
    Captures and returns any output printed to stdout.

    The code runs via ``exec`` with a single namespace that exposes ``df``,
    ``pd``, ``re``, ``plt``, ``sns``, ``np``, ``LinearRegression``, ``io`` and
    ``base64`` plus the normal built-ins.

    SECURITY: this is NOT a sandbox. ``python_code`` runs in-process with full
    built-ins (including ``__import__`` and ``open``). Only pass code you
    trust, or isolate the caller at the process/container level.

    Args:
        df: DataFrame made available to the code under the name ``df``.
        python_code: Source text to execute.

    Returns:
        Everything the code printed to stdout; a fixed success message if it
        printed nothing; or, when the code raises (including ``SystemExit``
        from ``exit()`` / ``sys.exit()``), an error description followed by
        the code that failed.
    """
    output_stream = io.StringIO()

    # One dictionary serves as both globals and locals for the executed code,
    # so functions defined inside the snippet can also see these names.
    execution_scope = {
        'df': df,
        'pd': pd,
        're': re,
        'plt': plt,
        'sns': sns,
        'np': np,
        'LinearRegression': LinearRegression,
        'io': io,
        'base64': base64,
        '__builtins__': __builtins__  # Ensure basic built-ins are available
    }

    try:
        with redirect_stdout(output_stream):
            exec(python_code, execution_scope)
    except (Exception, SystemExit) as e:
        # SystemExit is caught as well so a stray exit() in the snippet is
        # reported like any other failure instead of stopping the host.
        return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"
    finally:
        # Always release figures the snippet created, whatever the outcome.
        plt.close('all')

    result = output_stream.getvalue()
    if not result:
        return "Code executed successfully with no printed output."
    return result