KarthikMuraliM committed on
Commit
8de87af
·
1 Parent(s): 7b336c9

Fix: Refactor scraping to use table index instead of custom selector

Browse files
Files changed (1) hide show
  1. tools.py +18 -32
tools.py CHANGED
@@ -1,20 +1,17 @@
1
- # tools.py
2
  import pandas as pd
3
  from playwright.async_api import async_playwright
4
  from bs4 import BeautifulSoup
5
  import json
6
  import openai
7
 
8
- # Use the client initialized in the main app
9
  client = None
10
-
11
  def set_openai_client(c):
12
  global client
13
  client = c
14
 
15
  async def get_dynamic_html(url: str) -> str:
16
- """Fetches the fully rendered HTML of a page using Playwright's ASYNC API."""
17
- # 'async with' is the asynchronous context manager
18
  async with async_playwright() as p:
19
  browser = await p.chromium.launch()
20
  page = await browser.new_page()
@@ -29,8 +26,8 @@ async def get_dynamic_html(url: str) -> str:
29
 
30
  def choose_best_table_from_html(html_content: str, task_description: str) -> str:
31
  """
32
- Uses an LLM to identify the best table in the HTML for a given task.
33
- Returns a CSS selector for that table.
34
  """
35
  soup = BeautifulSoup(html_content, 'lxml')
36
  tables = soup.find_all('table')
@@ -40,56 +37,45 @@ def choose_best_table_from_html(html_content: str, task_description: str) -> str
40
 
41
  table_summaries = []
42
  for i, table in enumerate(tables):
43
- # Create a unique, stable selector for each table
44
- selector = f"table_{i}"
45
- table['data-agent-selector'] = selector
46
-
47
- # Get a small sample of the table's text content
48
  rows = table.find_all('tr')
49
  sample_text = ""
50
- for row in rows[:3]: # Sample first 3 rows
51
  cells = row.find_all(['td', 'th'])
52
  sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
53
 
54
  table_summaries.append({
55
- "selector": selector,
56
  "sample_data": sample_text.strip()
57
  })
58
 
59
  system_prompt = """
60
- You are an expert web scraping assistant. I will provide a list of tables found on a webpage, each with a unique selector and a sample of its data.
61
- Based on the user's task, your job is to identify the single best table that contains the relevant information.
62
- Respond with a single JSON object containing the selector of the best table, like this: {"selector": "table_1"}
63
  """
64
  user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
65
 
66
  try:
67
  completion = client.chat.completions.create(
68
- model="gpt-5-nano",
69
  response_format={"type": "json_object"},
70
- messages=[
71
- {"role": "system", "content": system_prompt},
72
- {"role": "user", "content": user_prompt}
73
- ]
74
  )
75
- # We return the raw JSON string from the LLM
76
  return completion.choices[0].message.content
77
  except Exception as e:
78
  return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
79
 
80
- def extract_table_to_dataframe(html_content: str, selector: str) -> (pd.DataFrame | str):
81
- """Extracts a specific table from HTML using its selector into a DataFrame."""
82
  soup = BeautifulSoup(html_content, 'lxml')
 
83
 
84
- # Find the table using our unique data attribute
85
- selected_table = soup.find('table', {'data-agent-selector': selector})
86
-
87
- if not selected_table:
88
- return f"Error: Could not find the table with selector '{selector}'."
89
 
 
 
90
  try:
91
- # We need to remove our custom attribute before pandas reads it
92
- del selected_table['data-agent-selector']
93
  df_list = pd.read_html(str(selected_table))
94
  if not df_list:
95
  return "Error: Pandas could not parse the selected table."
 
1
+ # tools.py (Index-based Version)
2
  import pandas as pd
3
  from playwright.async_api import async_playwright
4
  from bs4 import BeautifulSoup
5
  import json
6
  import openai
7
 
 
8
  client = None
 
9
  def set_openai_client(c):
10
  global client
11
  client = c
12
 
13
  async def get_dynamic_html(url: str) -> str:
14
+ # This function remains the same
 
15
  async with async_playwright() as p:
16
  browser = await p.chromium.launch()
17
  page = await browser.new_page()
 
26
 
27
  def choose_best_table_from_html(html_content: str, task_description: str) -> str:
28
  """
29
+ Uses an LLM to identify the best table by its INDEX.
30
+ Returns a JSON object with the table's index, e.g., {"index": 0}.
31
  """
32
  soup = BeautifulSoup(html_content, 'lxml')
33
  tables = soup.find_all('table')
 
37
 
38
  table_summaries = []
39
  for i, table in enumerate(tables):
 
 
 
 
 
40
  rows = table.find_all('tr')
41
  sample_text = ""
42
+ for row in rows[:3]:
43
  cells = row.find_all(['td', 'th'])
44
  sample_text += " | ".join(cell.get_text(strip=True) for cell in cells[:4]) + "\n"
45
 
46
  table_summaries.append({
47
+ "index": i, # Use the index as the identifier
48
  "sample_data": sample_text.strip()
49
  })
50
 
51
  system_prompt = """
52
+ You are an expert web scraping assistant. I will provide a list of tables, each identified by a numerical index.
53
+ Based on the user's task, your job is to identify the single best table.
54
+ Respond with a single JSON object containing the index of the best table, like this: {"index": 1}
55
  """
56
  user_prompt = f"User's task: '{task_description}'\n\nHere are the tables I found:\n{json.dumps(table_summaries, indent=2)}"
57
 
58
  try:
59
  completion = client.chat.completions.create(
60
+ model="gpt-4o",
61
  response_format={"type": "json_object"},
62
+ messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
 
63
  )
 
64
  return completion.choices[0].message.content
65
  except Exception as e:
66
  return f'{{"error": "LLM error in choosing table: {str(e)}"}}'
67
 
68
+ def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataFrame | str):
69
+ """Extracts a specific table from HTML using its index into a DataFrame."""
70
  soup = BeautifulSoup(html_content, 'lxml')
71
+ tables = soup.find_all('table')
72
 
73
+ if not 0 <= table_index < len(tables):
74
+ return f"Error: Invalid table index {table_index}. Only {len(tables)} tables were found."
 
 
 
75
 
76
+ selected_table = tables[table_index]
77
+
78
  try:
 
 
79
  df_list = pd.read_html(str(selected_table))
80
  if not df_list:
81
  return "Error: Pandas could not parse the selected table."