File size: 4,306 Bytes
bf1736b
 
 
74392da
 
cfd667b
bba9124
74392da
8bcc812
cfd667b
bf1736b
8bcc812
bf1736b
8bcc812
 
 
fe18036
8bcc812
 
 
fe18036
bf1736b
 
8bcc812
bf1736b
 
 
 
 
fe18036
bf1736b
74392da
 
 
bba9124
cfd667b
bba9124
a523805
bba9124
8bcc812
cfd667b
 
8bcc812
bba9124
 
8bcc812
bba9124
8bcc812
 
bba9124
8bcc812
 
 
 
bba9124
 
 
 
 
 
 
 
 
 
f757484
bba9124
f757484
bba9124
 
 
 
 
 
 
 
 
 
a523805
bba9124
 
74392da
 
bba9124
8bcc812
 
 
e9b9efe
bba9124
8bcc812
 
 
 
 
 
74392da
8bcc812
cfd667b
 
bba9124
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# app.py
from fastapi import FastAPI, File, UploadFile, Form
from typing import List
import openai
import json
import pandas as pd
import re

# Import our agent's tools
import tools

# Initialize FastAPI app
app = FastAPI()

# Initialize the OpenAI client.
# It will automatically pick up credentials from Hugging Face Secrets
# (reads the OPENAI_API_KEY environment variable by default).
client = openai.OpenAI()

# Give the tools module access to the initialized OpenAI client so that
# tool functions (e.g. table selection) can make LLM calls themselves.
tools.set_openai_client(client)

@app.get("/")
async def read_root():
    """Health-check endpoint: confirms the API process is alive."""
    status_message = "Data Analyst Agent API is running!"
    return {"message": status_message}

def _extract_numbered_questions(questions_text: str) -> str:
    """Return only the numbered question lines (e.g. '1. ...') from the text.

    Extracting just the numbered questions keeps the code-generation LLM
    focused and prevents it from being distracted by surrounding prose.
    Falls back to the full text when no numbered questions are found.
    """
    numbered = re.findall(r"^\d+\.\s.*", questions_text, re.MULTILINE)
    return "\n".join(numbered) or questions_text


@app.post("/api/")
async def analyze_data(
    questions_file: UploadFile = File(..., alias="questions.txt"),
    files: List[UploadFile] = File([], alias="files"),
):
    """Analyze an uploaded questions file, scraping a URL if one is referenced.

    Pipeline for scraping tasks:
      1. Fetch the page HTML (dynamic rendering via tools).
      2. Ask the LLM to choose the most relevant table.
      3. Extract that table into a pandas DataFrame.
      4. Generate a Python analysis script with the LLM.
      5. Execute the script against the DataFrame and return its output lines.

    Returns either a list of answer lines or a dict with an "error"/"response"
    key. `files` is accepted for API compatibility but not yet used here.
    """
    questions_text = (await questions_file.read()).decode("utf-8")

    lowered = questions_text.lower()
    # Guard clause: anything that isn't a scraping task exits early.
    if not ("scrape" in lowered and "http" in lowered):
        return {"response": "This is a non-scraping task."}

    # --- Steps 1-3: get html, choose table, extract df ---
    url = next((word for word in questions_text.split() if word.startswith("http")), None)
    if not url:
        return {"error": "URL not found."}

    html_content = await tools.get_dynamic_html(url)
    # tools returns an error string (containing "Error") on failure.
    if isinstance(html_content, str) and "Error" in html_content:
        return {"error": html_content}

    choice_json_str = tools.choose_best_table_from_html(html_content, questions_text)
    try:
        choice = json.loads(choice_json_str)
        table_index = choice.get("index")
    except (json.JSONDecodeError, TypeError, AttributeError):
        # Narrow catch: only malformed/non-dict LLM output, not every exception.
        return {"error": "Failed to decode LLM table choice."}
    if table_index is None:
        return {"error": "LLM failed to choose table."}

    df = tools.extract_table_to_dataframe(html_content, table_index)
    # tools returns an error string instead of a DataFrame on failure.
    if isinstance(df, str):
        return {"error": df}

    # --- STEP 4: GENERATE & EXECUTE PYTHON CODE ---
    print("Step 4: Generating Python code for analysis.")

    df_head = df.head().to_string()
    df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"

    cleaned_questions_text = _extract_numbered_questions(questions_text)
    print(f"--- Cleaned Questions for Code Gen ---\n{cleaned_questions_text}\n--------------------------------------")

    # Final, simplified system prompt
    system_prompt = """
        You are an expert Python data analyst. Your only job is to write a Python script.
        A pandas DataFrame `df` and libraries `pd`, `re`, `plt`, `sns`, `np`, `io`, `base64`, `LinearRegression` are pre-loaded.

        CRITICAL:
        - DO NOT import libraries.
        - DO NOT load data.
        - Your output must be ONLY raw Python code.
        - Clean the `df` DataFrame.
        - For each question, `print()` the answer.
        - For plots, print a base64 data URI.
        """

    user_prompt = f"{df_info}\n\nAnswer these questions with a Python script:\n\n{cleaned_questions_text}"

    try:
        completion = client.chat.completions.create(
            model="gpt-5-nano",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        response_content = completion.choices[0].message.content
        # Strip markdown code fences the model may wrap the script in.
        python_code = response_content.strip().replace("```python", "").replace("```", "").strip()

        print(f"--- Generated Python Code ---\n{python_code}\n-----------------------------")

        print("Step 5: Executing generated code.")
        execution_result = tools.run_python_code_on_dataframe(df, python_code)

        # One answer per non-empty printed line.
        final_answers = [line for line in execution_result.strip().split('\n') if line.strip()]
        return final_answers

    except Exception as e:
        # Boundary handler: surface any generation/execution failure to the caller.
        return {"error": f"An error occurred during code generation or execution: {str(e)}"}