KarthikMuraliM committed
Commit cfd667b · 1 Parent(s): 04090ba

Implement dynamic, agentic scraping with Playwright

Files changed (4)
  1. Dockerfile +6 -4
  2. app.py +48 -41
  3. requirements.txt +1 -1
  4. tools.py +86 -39
Dockerfile CHANGED
@@ -1,5 +1,6 @@
-# Use an official Python runtime as a parent image
-FROM python:3.10-slim
+# Use the official Playwright Docker image as a base for simplicity and reliability.
+# It comes with all necessary system dependencies pre-installed.
+FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
 
 # Set the working directory in the container
 WORKDIR /app
@@ -7,12 +8,13 @@ WORKDIR /app
 # Copy the current directory contents into the container at /app
 COPY . /app
 
-# Install any needed packages specified in requirements.txt
+# Install the Python packages specified in our requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
+# No need to run 'playwright install' as the base image already has browsers.
+
 # Make port 7860 available to the world outside this container
 EXPOSE 7860
 
 # Run the uvicorn server when the container launches
-# Hugging Face Spaces automatically maps this to your public URL
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,16 +1,16 @@
 # app.py
 from fastapi import FastAPI, File, UploadFile, Form
 from typing import List
-import os
 import openai
 import json
+import pandas as pd
 
-# Import our new tool
-from tools import scrape_url_to_dataframe
+# Import our new set of tools
+import tools
 
 app = FastAPI()
-
 client = openai.OpenAI()
+tools.set_openai_client(client)  # Give the tools module access to the client
 
 @app.get("/")
 async def read_root():
@@ -22,49 +22,56 @@ async def analyze_data(
     files: List[UploadFile] = File([], alias="files"),
 ):
     questions_text = (await questions_file.read()).decode("utf-8")
-
-    # --- LLM Decides Which Tool to Use ---
-    # We will use a more advanced agent framework later.
-    # For now, a simple keyword check is enough to demonstrate the concept.
 
     if "scrape" in questions_text.lower() and "http" in questions_text.lower():
-        # This is a scraping task. Let's find the URL.
-        url = None
-        for word in questions_text.split():
-            if word.startswith("http"):
-                url = word
-                break
-
+        url = next((word for word in questions_text.split() if word.startswith("http")), None)
         if not url:
-            return {"error": "Scraping task detected, but no URL found in the question."}
+            return {"error": "Scraping task detected, but no URL was found."}
 
-        # Call our scraping tool
-        scraped_data = scrape_url_to_dataframe(url)
+        # --- AGENT WORKFLOW ---
+        # 1. PERCEIVE: Get the full page content
+        print(f"Step 1: Fetching dynamic HTML from {url}")
+        html_content = tools.get_dynamic_html(url)
+        if "Error" in html_content:
+            return {"error": html_content}
 
-        # Check if the tool returned a DataFrame or an error string
-        if isinstance(scraped_data, str):
-            # The tool returned an error
-            return {"error": scraped_data}
+        # 2. DECIDE: Ask LLM to choose the best table for the task
+        print("Step 2: Asking LLM to choose the best table.")
+        task_description = f"Find a table with the following information: {questions_text}"
+        choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
 
-        # For now, just return the first 5 rows of the DataFrame as JSON
-        # In the next step, the LLM will analyze this data.
-        return {
-            "status": "Scraping complete",
-            "url": url,
-            "data_preview": json.loads(scraped_data.head().to_json(orient="records"))
-        }
+        try:
+            choice = json.loads(choice_json_str)
+            if "error" in choice:
+                return {"error": choice["error"]}
+            selector = choice.get("selector")
+            if not selector:
+                return {"error": "LLM failed to return a valid selector."}
+        except json.JSONDecodeError:
+            return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
+
+        # 3. ACT: Extract the chosen table into a DataFrame
+        print(f"Step 3: Extracting table with selector '{selector}'.")
+        df_or_error = tools.extract_table_to_dataframe(html_content, selector)
+        if isinstance(df_or_error, str):
+            return {"error": df_or_error}
+
+        # --- ANALYSIS (same as before) ---
+        print("Step 4: Analyzing data with LLM.")
+        data_string = df_or_error.to_csv(index=False)
+        if len(data_string) > 15000:
+            data_string = df_or_error.head(50).to_csv(index=False)
+
+        system_prompt = "You are an expert data analyst... respond with a JSON object: {\"answers\": [...]}"  # (Same prompt as before)
+        user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
 
-    else:
-        # This is a general knowledge task, same as before.
         try:
-            completion = client.chat.completions.create(
-                model="gpt-4o",
-                messages=[
-                    {"role": "system", "content": "You are a helpful assistant."},
-                    {"role": "user", "content": questions_text}
-                ]
-            )
-            llm_response = completion.choices[0].message.content
-            return {"status": "LLM query complete", "response": llm_response}
+            completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
+            response_data = json.loads(completion.choices[0].message.content)
+            return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
         except Exception as e:
-            return {"error": f"Error calling LLM: {e}"}
+            return {"error": f"Error during final analysis: {str(e)}"}
+
+    else:
+        # Handle non-scraping tasks here
+        return {"response": "This is a non-scraping task."}
requirements.txt CHANGED
@@ -7,4 +7,4 @@ beautifulsoup4 # Add this line for parsing HTML
 lxml # Add this line, it's a fast parser for BeautifulSoup
 pandas
 numpy
-# playwright
+playwright
tools.py CHANGED
@@ -1,51 +1,98 @@
 # tools.py
-import requests
-from bs4 import BeautifulSoup
 import pandas as pd
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+import json
+import openai
+
+# Use the client initialized in the main app
+client = None
+
+def set_openai_client(c):
+    global client
+    client = c
+
+def get_dynamic_html(url: str) -> str:
+    """Fetches the fully rendered HTML of a page using Playwright."""
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+        try:
+            # Use networkidle to wait for most dynamic content to load
+            page.goto(url, timeout=20000, wait_until='networkidle')
+            html_content = page.content()
+        except Exception as e:
+            browser.close()
+            return f"Error fetching page with Playwright: {e}"
+        browser.close()
+        return html_content
 
-def scrape_url_to_dataframe(url: str) -> (pd.DataFrame | str):
+def choose_best_table_from_html(html_content: str, task_description: str) -> str:
     """
-    Scrapes a given URL for the first HTML table and returns it as a pandas DataFrame.
-    If no table is found or an error occurs, it returns an error message string.
+    Uses an LLM to identify the best table in the HTML for a given task.
+    Returns a CSS selector for that table.
     """
-    try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+    soup = BeautifulSoup(html_content, 'lxml')
+    tables = soup.find_all('table')
+
+    if not tables:
+        return '{"error": "No tables found on the page."}'
 
-        soup = BeautifulSoup(response.content, 'lxml')
+    table_summaries = []
+    for i, table in enumerate(tables):
+        # Create a unique, stable selector for each table
+        selector = f"table_{i}"
+        table['data-agent-selector'] = selector
 
-        # Find the first table in the HTML. Wikipedia pages often have the main data here.
-        table = soup.find('table', {'class': 'wikitable'})
+        # Get a small sample of the table's text content
+        rows = table.find_all('tr')
+        sample_text = ""
+        for row in rows[:3]:  # Sample first 3 rows
+            cells = row.find_all(['td', 'th'])
+            sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
 
-        if not table:
-            return "Error: No table with class 'wikitable' found on the page."
-
-        # Use pandas to read the HTML table directly into a DataFrame
-        # read_html returns a list of DataFrames, we want the first one.
-        df_list = pd.read_html(str(table))
-        if not df_list:
-            return "Error: Pandas could not parse any tables from the HTML."
-
-        df = df_list[0]
-        return df
+        table_summaries.append({
+            "selector": selector,
+            "sample_data": sample_text.strip()
+        })
+
+    system_prompt = """
+    You are an expert web scraping assistant. I will provide a list of tables found on a webpage, each with a unique selector and a sample of its data.
+    Based on the user's task, your job is to identify the single best table that contains the relevant information.
+    Respond with a single JSON object containing the selector of the best table, like this: {"selector": "table_1"}
+    """
+    user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
 
-    except requests.exceptions.RequestException as e:
-        return f"Error fetching URL: {e}"
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o",
+            response_format={"type": "json_object"},
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ]
+        )
+        # We return the raw JSON string from the LLM
+        return completion.choices[0].message.content
     except Exception as e:
-        return f"An unexpected error occurred: {e}"
+        return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
+
+def extract_table_to_dataframe(html_content: str, selector: str) -> (pd.DataFrame | str):
+    """Extracts a specific table from HTML using its selector into a DataFrame."""
+    soup = BeautifulSoup(html_content, 'lxml')
 
+    # Find the table using our unique data attribute
+    selected_table = soup.find('table', {'data-agent-selector': selector})
+
+    if not selected_table:
+        return f"Error: Could not find the table with selector '{selector}'."
 
-# from playwright.sync_api import sync_playwright
-
-# def scrape_dynamic_url(url: str) -> str:
-#     """Scrapes a dynamic URL using Playwright and returns the final HTML."""
-#     with sync_playwright() as p:
-#         browser = p.chromium.launch()
-#         page = browser.new_page()
-#         page.goto(url, wait_until='networkidle')  # Wait for network activity to cease
-#         html_content = page.content()
-#         browser.close()
-#         return html_content
+    try:
+        # We need to remove our custom attribute before pandas reads it
+        del selected_table['data-agent-selector']
+        df_list = pd.read_html(str(selected_table))
+        if not df_list:
+            return "Error: Pandas could not parse the selected table."
+        return df_list[0]
+    except Exception as e:
+        return f"Error converting table to DataFrame: {e}"