KarthikMuraliM committed
Commit 74392da · 1 Parent(s): fe18036

Add web scraping tool and simple router

Files changed (3):
  1. app.py +52 -32
  2. requirements.txt +4 -1
  3. tools.py +38 -0
app.py CHANGED
@@ -2,13 +2,14 @@
 from fastapi import FastAPI, File, UploadFile, Form
 from typing import List
 import os
-import openai # Import the openai library
+import openai
+import json
+
+# Import our new tool
+from tools import scrape_url_to_dataframe
 
 app = FastAPI()
 
-# Initialize the OpenAI client.
-# It will automatically pick up the OPENAI_API_KEY and OPENAI_BASE_URL
-# from the environment variables (our Hugging Face Secrets).
 client = openai.OpenAI()
 
 @app.get("/")
@@ -20,31 +21,50 @@ async def analyze_data(
     questions_file: UploadFile = File(..., alias="questions.txt"),
     files: List[UploadFile] = File([], alias="files"),
 ):
-    # Read the content of questions.txt
-    questions_content = await questions_file.read()
-    questions_text = questions_content.decode("utf-8")
-
-    # --- LLM INTEGRATION ---
-    llm_response_content = "No response from LLM." # Default message
-    try:
-        # Create a simple prompt for the LLM
-        completion = client.chat.completions.create(
-            model="gpt-5-nano", # You can try other models like "mistralai/mistral-7b-instruct"
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": f"Here are the questions I need answered:\n\n{questions_text}\n\nCan you acknowledge that you received them?"}
-            ]
-        )
-        llm_response_content = completion.choices[0].message.content
-    except Exception as e:
-        # If the LLM call fails, we'll know why
-        llm_response_content = f"Error calling LLM: {e}"
-    # --- END LLM INTEGRATION ---
-
-    # We will build a more structured response later.
-    # For now, just return the raw LLM response.
-    return {
-        "status": "Processing complete",
-        "received_questions": questions_text,
-        "llm_acknowledgement": llm_response_content
-    }
+    questions_text = (await questions_file.read()).decode("utf-8")
+
+    # --- LLM Decides Which Tool to Use ---
+    # We will use a more advanced agent framework later.
+    # For now, a simple keyword check is enough to demonstrate the concept.
+
+    if "scrape" in questions_text.lower() and "http" in questions_text.lower():
+        # This is a scraping task. Let's find the URL.
+        url = None
+        for word in questions_text.split():
+            if word.startswith("http"):
+                url = word
+                break
+
+        if not url:
+            return {"error": "Scraping task detected, but no URL found in the question."}
+
+        # Call our scraping tool
+        scraped_data = scrape_url_to_dataframe(url)
+
+        # Check if the tool returned a DataFrame or an error string
+        if isinstance(scraped_data, str):
+            # The tool returned an error
+            return {"error": scraped_data}
+
+        # For now, just return the first 5 rows of the DataFrame as JSON.
+        # In the next step, the LLM will analyze this data.
+        return {
+            "status": "Scraping complete",
+            "url": url,
+            "data_preview": json.loads(scraped_data.head().to_json(orient="records"))
+        }
+
+    else:
+        # This is a general knowledge task, same as before.
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-4o",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": questions_text}
+                ]
+            )
+            llm_response = completion.choices[0].message.content
+            return {"status": "LLM query complete", "response": llm_response}
+        except Exception as e:
+            return {"error": f"Error calling LLM: {e}"}
requirements.txt CHANGED
@@ -1,4 +1,7 @@
 fastapi
 uvicorn
 python-multipart # Required for FastAPI to handle file uploads
-openai
+openai
+requests # For making HTTP requests
+beautifulsoup4 # For parsing HTML
+lxml # A fast parser backend for BeautifulSoup
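The additions can be smoke-tested before redeploying. One caveat worth flagging: tools.py below also imports pandas, which this commit does not add to requirements.txt, so the import only succeeds if pandas is already present in the image. A hypothetical check:

# check_deps.py -- hypothetical import smoke test for the scraping stack.
# "bs4" is the import name of the beautifulsoup4 package; lxml is used
# indirectly, as the parser backend that BeautifulSoup selects.
import importlib

for module in ("requests", "bs4", "lxml", "pandas"):
    importlib.import_module(module)
    print(f"{module}: OK")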
tools.py ADDED
@@ -0,0 +1,38 @@
+# tools.py
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+
+def scrape_url_to_dataframe(url: str) -> (pd.DataFrame | str):
+    """
+    Scrapes a given URL for the first 'wikitable' HTML table and returns it as a pandas DataFrame.
+    If no table is found or an error occurs, it returns an error message string.
+    """
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+
+        soup = BeautifulSoup(response.content, 'lxml')
+
+        # Find the first table with class 'wikitable'; Wikipedia keeps its main data tables there.
+        table = soup.find('table', {'class': 'wikitable'})
+
+        if not table:
+            return "Error: No table with class 'wikitable' found on the page."
+
+        # Use pandas to read the HTML table directly into a DataFrame.
+        # read_html returns a list of DataFrames; we want the first one.
+        df_list = pd.read_html(str(table))
+        if not df_list:
+            return "Error: Pandas could not parse any tables from the HTML."
+
+        df = df_list[0]
+        return df
+
+    except requests.exceptions.RequestException as e:
+        return f"Error fetching URL: {e}"
+    except Exception as e:
+        return f"An unexpected error occurred: {e}"