Spaces:
Sleeping
Sleeping
Commit
·
8de87af
1
Parent(s):
7b336c9
Fix: Refactor scraping to use table index instead of custom selector
Browse files
tools.py
CHANGED
|
@@ -1,20 +1,17 @@
|
|
| 1 |
-
# tools.py
|
| 2 |
import pandas as pd
|
| 3 |
from playwright.async_api import async_playwright
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import json
|
| 6 |
import openai
|
| 7 |
|
| 8 |
-
# Use the client initialized in the main app
|
| 9 |
client = None
|
| 10 |
-
|
| 11 |
def set_openai_client(c):
|
| 12 |
global client
|
| 13 |
client = c
|
| 14 |
|
| 15 |
async def get_dynamic_html(url: str) -> str:
|
| 16 |
-
|
| 17 |
-
# 'async with' is the asynchronous context manager
|
| 18 |
async with async_playwright() as p:
|
| 19 |
browser = await p.chromium.launch()
|
| 20 |
page = await browser.new_page()
|
|
@@ -29,8 +26,8 @@ async def get_dynamic_html(url: str) -> str:
|
|
| 29 |
|
| 30 |
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
|
| 31 |
"""
|
| 32 |
-
Uses an LLM to identify the best table
|
| 33 |
-
Returns a
|
| 34 |
"""
|
| 35 |
soup = BeautifulSoup(html_content, 'lxml')
|
| 36 |
tables = soup.find_all('table')
|
|
@@ -40,56 +37,45 @@ def choose_best_table_from_html(html_content: str, task_description: str) -> str
|
|
| 40 |
|
| 41 |
table_summaries = []
|
| 42 |
for i, table in enumerate(tables):
|
| 43 |
-
# Create a unique, stable selector for each table
|
| 44 |
-
selector = f"table_{i}"
|
| 45 |
-
table['data-agent-selector'] = selector
|
| 46 |
-
|
| 47 |
-
# Get a small sample of the table's text content
|
| 48 |
rows = table.find_all('tr')
|
| 49 |
sample_text = ""
|
| 50 |
-
for row in rows[:3]:
|
| 51 |
cells = row.find_all(['td', 'th'])
|
| 52 |
sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
|
| 53 |
|
| 54 |
table_summaries.append({
|
| 55 |
-
"
|
| 56 |
"sample_data": sample_text.strip()
|
| 57 |
})
|
| 58 |
|
| 59 |
system_prompt = """
|
| 60 |
-
You are an expert web scraping assistant. I will provide a list of tables
|
| 61 |
-
Based on the user's task, your job is to identify the single best table
|
| 62 |
-
Respond with a single JSON object containing the
|
| 63 |
"""
|
| 64 |
user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
|
| 65 |
|
| 66 |
try:
|
| 67 |
completion = client.chat.completions.create(
|
| 68 |
-
model="gpt-
|
| 69 |
response_format={"type": "json_object"},
|
| 70 |
-
messages=[
|
| 71 |
-
{"role": "system", "content": system_prompt},
|
| 72 |
-
{"role": "user", "content": user_prompt}
|
| 73 |
-
]
|
| 74 |
)
|
| 75 |
-
# We return the raw JSON string from the LLM
|
| 76 |
return completion.choices[0].message.content
|
| 77 |
except Exception as e:
|
| 78 |
return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
|
| 79 |
|
| 80 |
-
def extract_table_to_dataframe(html_content: str,
|
| 81 |
-
"""Extracts a specific table from HTML using its
|
| 82 |
soup = BeautifulSoup(html_content, 'lxml')
|
|
|
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
if not selected_table:
|
| 88 |
-
return f"Error: Could not find the table with selector '{selector}'."
|
| 89 |
|
|
|
|
|
|
|
| 90 |
try:
|
| 91 |
-
# We need to remove our custom attribute before pandas reads it
|
| 92 |
-
del selected_table['data-agent-selector']
|
| 93 |
df_list = pd.read_html(str(selected_table))
|
| 94 |
if not df_list:
|
| 95 |
return "Error: Pandas could not parse the selected table."
|
|
|
|
| 1 |
+
# tools.py (Index-based Version)
|
| 2 |
import pandas as pd
|
| 3 |
from playwright.async_api import async_playwright
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import json
|
| 6 |
import openai
|
| 7 |
|
|
|
|
| 8 |
client = None
|
|
|
|
| 9 |
def set_openai_client(c):
    """Register the shared OpenAI client for this module.

    The main application initializes the client once and injects it here so
    the scraping helpers (e.g. the table-chooser) can issue LLM calls without
    constructing their own client.

    Args:
        c: An initialized OpenAI client instance (stored in the module-level
           ``client`` global).
    """
    # Rebind the module-level ``client`` that the other helpers read.
    global client
    client = c
|
| 12 |
|
| 13 |
async def get_dynamic_html(url: str) -> str:
|
| 14 |
+
# This function remains the same
|
|
|
|
| 15 |
async with async_playwright() as p:
|
| 16 |
browser = await p.chromium.launch()
|
| 17 |
page = await browser.new_page()
|
|
|
|
| 26 |
|
| 27 |
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
|
| 28 |
"""
|
| 29 |
+
Uses an LLM to identify the best table by its INDEX.
|
| 30 |
+
Returns a JSON object with the table's index, e.g., {"index": 0}.
|
| 31 |
"""
|
| 32 |
soup = BeautifulSoup(html_content, 'lxml')
|
| 33 |
tables = soup.find_all('table')
|
|
|
|
| 37 |
|
| 38 |
table_summaries = []
|
| 39 |
for i, table in enumerate(tables):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
rows = table.find_all('tr')
|
| 41 |
sample_text = ""
|
| 42 |
+
for row in rows[:3]:
|
| 43 |
cells = row.find_all(['td', 'th'])
|
| 44 |
sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
|
| 45 |
|
| 46 |
table_summaries.append({
|
| 47 |
+
"index": i, # Use the index as the identifier
|
| 48 |
"sample_data": sample_text.strip()
|
| 49 |
})
|
| 50 |
|
| 51 |
system_prompt = """
|
| 52 |
+
You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.
|
| 53 |
+
Based on the user's task, your job is to identify the single best table.
|
| 54 |
+
Respond with a single JSON object containing the index of the best table, like this: {"index": 1}
|
| 55 |
"""
|
| 56 |
user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
|
| 57 |
|
| 58 |
try:
|
| 59 |
completion = client.chat.completions.create(
|
| 60 |
+
model="gpt-4o",
|
| 61 |
response_format={"type": "json_object"},
|
| 62 |
+
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
|
|
|
|
|
|
|
|
|
|
| 63 |
)
|
|
|
|
| 64 |
return completion.choices[0].message.content
|
| 65 |
except Exception as e:
|
| 66 |
return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
|
| 67 |
|
| 68 |
+
def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame | str):
|
| 69 |
+
"""Extracts a specific table from HTML using its index into a DataFrame."""
|
| 70 |
soup = BeautifulSoup(html_content, 'lxml')
|
| 71 |
+
tables = soup.find_all('table')
|
| 72 |
|
| 73 |
+
if not 0 <= table_index < len(tables):
|
| 74 |
+
return f"Error: Invalid table index {table_index}. Only {len(tables)} tables were found."
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
selected_table = tables[table_index]
|
| 77 |
+
|
| 78 |
try:
|
|
|
|
|
|
|
| 79 |
df_list = pd.read_html(str(selected_table))
|
| 80 |
if not df_list:
|
| 81 |
return "Error: Pandas could not parse the selected table."
|