Commit 74392da · Add web scraping tool and simple router
Parent(s): fe18036

Files changed:
- app.py +52 -32
- requirements.txt +4 -1
- tools.py +38 -0
app.py
CHANGED
@@ -2,13 +2,14 @@
 from fastapi import FastAPI, File, UploadFile, Form
 from typing import List
 import os
 import openai
+import json

+# Import our new tool
+from tools import scrape_url_to_dataframe
+
 app = FastAPI()

-# Initialize the OpenAI client.
-# It will automatically pick up the OPENAI_API_KEY and OPENAI_BASE_URL
-# from the environment variables (our Hugging Face Secrets).
 client = openai.OpenAI()

 @app.get("/")
@@ -20,31 +21,50 @@ async def analyze_data(
     questions_file: UploadFile = File(..., alias="questions.txt"),
     files: List[UploadFile] = File([], alias="files"),
 ):
-    … (28 removed lines: previous handler body, not legible in this capture)
+    questions_text = (await questions_file.read()).decode("utf-8")
+
+    # --- LLM Decides Which Tool to Use ---
+    # We will use a more advanced agent framework later.
+    # For now, a simple keyword check is enough to demonstrate the concept.
+
+    if "scrape" in questions_text.lower() and "http" in questions_text.lower():
+        # This is a scraping task. Let's find the URL.
+        url = None
+        for word in questions_text.split():
+            if word.startswith("http"):
+                url = word
+                break
+
+        if not url:
+            return {"error": "Scraping task detected, but no URL found in the question."}
+
+        # Call our scraping tool
+        scraped_data = scrape_url_to_dataframe(url)
+
+        # Check if the tool returned a DataFrame or an error string
+        if isinstance(scraped_data, str):
+            # The tool returned an error
+            return {"error": scraped_data}
+
+        # For now, just return the first 5 rows of the DataFrame as JSON.
+        # In the next step, the LLM will analyze this data.
+        return {
+            "status": "Scraping complete",
+            "url": url,
+            "data_preview": json.loads(scraped_data.head().to_json(orient="records"))
+        }
+
+    else:
+        # This is a general knowledge task, same as before.
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-4o",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": questions_text}
+                ]
+            )
+            llm_response = completion.choices[0].message.content
+            return {"status": "LLM query complete", "response": llm_response}
+        except Exception as e:
+            return {"error": f"Error calling LLM: {e}"}
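A minimal sketch of exercising the new router from a client, assuming the handler is mounted as a POST route at "/api/" on a local uvicorn server at port 8000 — neither the path nor the method is visible in this diff:

# client_example.py — hypothetical smoke test for the keyword router.
import requests

question = "Scrape https://example.com/some-page and summarize the table."

resp = requests.post(
    "http://localhost:8000/api/",  # assumed route; check the @app decorator
    files={"questions.txt": ("questions.txt", question.encode("utf-8"))},
)
print(resp.json())  # expects {"status": "Scraping complete", ...} or an {"error": ...} dict

Because the question contains both "scrape" and "http", the keyword check routes it to the scraping tool; a question without those words falls through to the plain LLM call.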
requirements.txt
CHANGED
@@ -1,4 +1,7 @@
 fastapi
 uvicorn
 python-multipart  # Required for FastAPI to handle file uploads
 openai
+requests          # For making HTTP requests
+beautifulsoup4    # For parsing HTML
+lxml              # Fast HTML parser backend for BeautifulSoup
tools.py
ADDED
@@ -0,0 +1,38 @@
+# tools.py
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+
+def scrape_url_to_dataframe(url: str) -> pd.DataFrame | str:
+    """
+    Scrapes a given URL for the first 'wikitable' HTML table and returns it as a pandas DataFrame.
+    If no table is found or an error occurs, it returns an error message string.
+    """
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+
+        soup = BeautifulSoup(response.content, 'lxml')
+
+        # Find the first table with class 'wikitable'. Wikipedia pages often keep the main data there.
+        table = soup.find('table', {'class': 'wikitable'})
+
+        if not table:
+            return "Error: No table with class 'wikitable' found on the page."
+
+        # Use pandas to read the HTML table directly into a DataFrame.
+        # read_html returns a list of DataFrames; we want the first one.
+        df_list = pd.read_html(str(table))
+        if not df_list:
+            return "Error: Pandas could not parse any tables from the HTML."
+
+        df = df_list[0]
+        return df
+
+    except requests.exceptions.RequestException as e:
+        return f"Error fetching URL: {e}"
+    except Exception as e:
+        return f"An unexpected error occurred: {e}"
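The string-or-DataFrame return contract above is what the isinstance check in app.py relies on. A quick local sketch of both return shapes; the Wikipedia URL is only an illustrative example, not one used by the Space:

# Hypothetical local check of scrape_url_to_dataframe's two outcomes.
import pandas as pd
from tools import scrape_url_to_dataframe

result = scrape_url_to_dataframe("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
if isinstance(result, pd.DataFrame):
    print(result.head())  # success path: a DataFrame of the first wikitable
else:
    print(result)         # failure path: an "Error: ..." message string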