KarthikMuraliM committed
Commit cfd667b · 1 Parent(s): 04090ba

Implement dynamic, agentic scraping with Playwright

Files changed (4)
  1. Dockerfile +6 -4
  2. app.py +48 -41
  3. requirements.txt +1 -1
  4. tools.py +86 -39
Dockerfile CHANGED
@@ -1,5 +1,6 @@
-# Use an official Python runtime as a parent image
-FROM python:3.10-slim
+# Use the official Playwright Docker image as a base for simplicity and reliability.
+# It comes with all necessary system dependencies pre-installed.
+FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
 
 # Set the working directory in the container
 WORKDIR /app
@@ -7,12 +8,13 @@ WORKDIR /app
 # Copy the current directory contents into the container at /app
 COPY . /app
 
-# Install any needed packages specified in requirements.txt
+# Install the Python packages specified in our requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
+# No need to run 'playwright install' as the base image already has browsers.
+
 # Make port 7860 available to the world outside this container
 EXPOSE 7860
 
 # Run the uvicorn server when the container launches
-# Hugging Face Spaces automatically maps this to your public URL
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,16 +1,16 @@
 # app.py
 from fastapi import FastAPI, File, UploadFile, Form
 from typing import List
-import os
 import openai
 import json
+import pandas as pd
 
-# Import our new tool
-from tools import scrape_url_to_dataframe
+# Import our new set of tools
+import tools
 
 app = FastAPI()
-
 client = openai.OpenAI()
+tools.set_openai_client(client)  # Give the tools module access to the client
 
 @app.get("/")
 async def read_root():
@@ -22,49 +22,56 @@ async def analyze_data(
     files: List[UploadFile] = File([], alias="files"),
 ):
     questions_text = (await questions_file.read()).decode("utf-8")
-
-    # --- LLM Decides Which Tool to Use ---
-    # We will use a more advanced agent framework later.
-    # For now, a simple keyword check is enough to demonstrate the concept.
 
     if "scrape" in questions_text.lower() and "http" in questions_text.lower():
-        # This is a scraping task. Let's find the URL.
-        url = None
-        for word in questions_text.split():
-            if word.startswith("http"):
-                url = word
-                break
-
+        url = next((word for word in questions_text.split() if word.startswith("http")), None)
         if not url:
-            return {"error": "Scraping task detected, but no URL found in the question."}
+            return {"error": "Scraping task detected, but no URL was found."}
 
-        # Call our scraping tool
-        scraped_data = scrape_url_to_dataframe(url)
+        # --- AGENT WORKFLOW ---
+        # 1. PERCEIVE: Get the full page content
+        print(f"Step 1: Fetching dynamic HTML from {url}")
+        html_content = tools.get_dynamic_html(url)
+        if "Error" in html_content:
+            return {"error": html_content}
 
-        # Check if the tool returned a DataFrame or an error string
-        if isinstance(scraped_data, str):
-            # The tool returned an error
-            return {"error": scraped_data}
+        # 2. DECIDE: Ask LLM to choose the best table for the task
+        print("Step 2: Asking LLM to choose the best table.")
+        task_description = f"Find a table with the following information: {questions_text}"
+        choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
 
-        # For now, just return the first 5 rows of the DataFrame as JSON
-        # In the next step, the LLM will analyze this data.
-        return {
-            "status": "Scraping complete",
-            "url": url,
-            "data_preview": json.loads(scraped_data.head().to_json(orient="records"))
-        }
+        try:
+            choice = json.loads(choice_json_str)
+            if "error" in choice:
+                return {"error": choice["error"]}
+            selector = choice.get("selector")
+            if not selector:
+                return {"error": "LLM failed to return a valid selector."}
+        except json.JSONDecodeError:
+            return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
+
+        # 3. ACT: Extract the chosen table into a DataFrame
+        print(f"Step 3: Extracting table with selector '{selector}'.")
+        df_or_error = tools.extract_table_to_dataframe(html_content, selector)
+        if isinstance(df_or_error, str):
+            return {"error": df_or_error}
+
+        # --- ANALYSIS (same as before) ---
+        print("Step 4: Analyzing data with LLM.")
+        data_string = df_or_error.to_csv(index=False)
+        if len(data_string) > 15000:
+            data_string = df_or_error.head(50).to_csv(index=False)
+
+        system_prompt = "You are an expert data analyst... respond with a JSON object: {\"answers\": [...]}"  # (Same prompt as before)
+        user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
 
-    else:
-        # This is a general knowledge task, same as before.
         try:
-            completion = client.chat.completions.create(
-                model="gpt-4o",
-                messages=[
-                    {"role": "system", "content": "You are a helpful assistant."},
-                    {"role": "user", "content": questions_text}
-                ]
-            )
-            llm_response = completion.choices[0].message.content
-            return {"status": "LLM query complete", "response": llm_response}
+            completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
+            response_data = json.loads(completion.choices[0].message.content)
+            return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
         except Exception as e:
-            return {"error": f"Error calling LLM: {e}"}
+            return {"error": f"Error during final analysis: {str(e)}"}
+
+    else:
+        # Handle non-scraping tasks here
+        return {"response": "This is a non-scraping task."}
requirements.txt CHANGED
@@ -7,4 +7,4 @@ beautifulsoup4 # Add this line for parsing HTML
 lxml # Add this line, it's a fast parser for BeautifulSoup
 pandas
 numpy
-# playwright
+playwright
tools.py CHANGED
@@ -1,51 +1,98 @@
 # tools.py
-import requests
-from bs4 import BeautifulSoup
 import pandas as pd
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+import json
+import openai
+
+# Use the client initialized in the main app
+client = None
+
+def set_openai_client(c):
+    global client
+    client = c
+
+def get_dynamic_html(url: str) -> str:
+    """Fetches the fully rendered HTML of a page using Playwright."""
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+        try:
+            # Use networkidle to wait for most dynamic content to load
+            page.goto(url, timeout=20000, wait_until='networkidle')
+            html_content = page.content()
+        except Exception as e:
+            browser.close()
+            return f"Error fetching page with Playwright: {e}"
+        browser.close()
+        return html_content
 
-def scrape_url_to_dataframe(url: str) -> (pd.DataFrame | str):
+def choose_best_table_from_html(html_content: str, task_description: str) -> str:
     """
-    Scrapes a given URL for the first HTML table and returns it as a pandas DataFrame.
-    If no table is found or an error occurs, it returns an error message string.
+    Uses an LLM to identify the best table in the HTML for a given task.
+    Returns a CSS selector for that table.
     """
-    try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+    soup = BeautifulSoup(html_content, 'lxml')
+    tables = soup.find_all('table')
+
+    if not tables:
+        return '{"error": "No tables found on the page."}'
 
-        soup = BeautifulSoup(response.content, 'lxml')
+    table_summaries = []
+    for i, table in enumerate(tables):
+        # Create a unique, stable selector for each table
+        selector = f"table_{i}"
+        table['data-agent-selector'] = selector
 
-        # Find the first table in the HTML. Wikipedia pages often have the main data here.
-        table = soup.find('table', {'class': 'wikitable'})
+        # Get a small sample of the table's text content
+        rows = table.find_all('tr')
+        sample_text = ""
+        for row in rows[:3]:  # Sample first 3 rows
+            cells = row.find_all(['td', 'th'])
+            sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
 
-        if not table:
-            return "Error: No table with class 'wikitable' found on the page."
-
-        # Use pandas to read the HTML table directly into a DataFrame
-        # read_html returns a list of DataFrames, we want the first one.
-        df_list = pd.read_html(str(table))
-        if not df_list:
-            return "Error: Pandas could not parse any tables from the HTML."
-
-        df = df_list[0]
-        return df
+        table_summaries.append({
+            "selector": selector,
+            "sample_data": sample_text.strip()
+        })
+
+    system_prompt = """
+    You are an expert web scraping assistant. I will provide a list of tables found on a webpage, each with a unique selector and a sample of its data.
+    Based on the user's task, your job is to identify the single best table that contains the relevant information.
+    Respond with a single JSON object containing the selector of the best table, like this: {"selector": "table_1"}
+    """
+    user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
 
-    except requests.exceptions.RequestException as e:
-        return f"Error fetching URL: {e}"
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o",
+            response_format={"type": "json_object"},
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ]
+        )
+        # We return the raw JSON string from the LLM
+        return completion.choices[0].message.content
     except Exception as e:
-        return f"An unexpected error occurred: {e}"
+        return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
+
+def extract_table_to_dataframe(html_content: str, selector: str) -> (pd.DataFrame | str):
+    """Extracts a specific table from HTML using its selector into a DataFrame."""
+    soup = BeautifulSoup(html_content, 'lxml')
 
+    # Find the table using our unique data attribute
+    selected_table = soup.find('table', {'data-agent-selector': selector})
+
+    if not selected_table:
+        return f"Error: Could not find the table with selector '{selector}'."
 
-# from playwright.sync_api import sync_playwright
-
-# def scrape_dynamic_url(url: str) -> str:
-#     """Scrapes a dynamic URL using Playwright and returns the final HTML."""
-#     with sync_playwright() as p:
-#         browser = p.chromium.launch()
-#         page = browser.new_page()
-#         page.goto(url, wait_until='networkidle')  # Wait for network activity to cease
-#         html_content = page.content()
-#         browser.close()
-#         return html_content
+    try:
+        # We need to remove our custom attribute before pandas reads it
+        del selected_table['data-agent-selector']
+        df_list = pd.read_html(str(selected_table))
+        if not df_list:
+            return "Error: Pandas could not parse the selected table."
+        return df_list[0]
+    except Exception as e:
+        return f"Error converting table to DataFrame: {e}"