# app.py
"""FastAPI entry point for the Data Analyst Agent API.

Exposes two endpoints:

* ``GET /``      -- liveness check.
* ``POST /api/`` -- accepts a ``questions.txt`` upload. When the questions ask
  to scrape a URL, the page is fetched, an LLM picks the most relevant HTML
  table, a second LLM call generates pandas analysis code, and that code is
  executed against the extracted DataFrame.

All fetching, table extraction and code execution is delegated to the
project-local ``tools`` module.
"""
import json
import re
from typing import List, Optional

import openai
import pandas as pd
from fastapi import FastAPI, File, UploadFile, Form

# Import our agent's tools
import tools

# Initialize FastAPI app
app = FastAPI()

# Initialize the OpenAI client.
# No credentials are passed explicitly; the client picks them up from the
# environment (per the original note: Hugging Face Secrets).
client = openai.OpenAI()

# Give the tools module access to the initialized OpenAI client
tools.set_openai_client(client)

# Only real URL schemes count; a bare "http" prefix would also match ordinary
# words such as "http-based".
_URL_SCHEMES = ("http://", "https://")

# Lines of the form "1. <question>" inside the uploaded questions file.
_NUMBERED_QUESTION_RE = re.compile(r"^\d+\.\s.*", re.MULTILINE)

# System prompt for the code-generation call.
_CODE_GEN_SYSTEM_PROMPT = """
You are an expert Python data analyst. Your only job is to write a Python script.
A pandas DataFrame `df` and libraries `pd`, `re`, `plt`, `sns`, `np`, `io`, `base64`, `LinearRegression` are pre-loaded.
CRITICAL:
- DO NOT import libraries.
- DO NOT load data.
- Your output must be ONLY raw Python code.
- Clean the `df` DataFrame.
- For each question, `print()` the answer.
- For plots, print a base64 data URI.
"""


def _find_url(text: str) -> Optional[str]:
    """Return the first whitespace-delimited http(s) URL in *text*, or None.

    The scheme is matched case-insensitively, and sentence punctuation that
    directly follows the URL (``.``, ``,``, ``;``) is stripped.
    """
    for word in text.split():
        if word.lower().startswith(_URL_SCHEMES):
            return word.rstrip(".,;")
    return None


def _select_numbered_questions(text: str) -> str:
    """Return only the numbered question lines of *text*, newline-joined.

    Falls back to the full text when no line looks like ``"<n>. ..."``.
    """
    # rstrip() drops the "\r" that CRLF-encoded uploads leave on each line.
    numbered = [line.rstrip() for line in _NUMBERED_QUESTION_RE.findall(text)]
    return "\n".join(numbered) or text


def _strip_code_fences(text: str) -> str:
    """Remove Markdown code-fence markers from an LLM reply."""
    return text.strip().replace("```python", "").replace("```", "").strip()


@app.get("/")
async def read_root():
    """A simple root endpoint to confirm the API is running."""
    return {"message": "Data Analyst Agent API is running!"}


@app.post("/api/")
async def analyze_data(
    questions_file: UploadFile = File(..., alias="questions.txt"),
    files: List[UploadFile] = File([], alias="files"),
):
    """Answer the uploaded questions by scraping a table and running LLM-written code.

    Args:
        questions_file: Upload sent under the form field ``questions.txt``.
            Its text must mention "scrape" and contain an http(s) URL for the
            scraping pipeline to run.
        files: Optional extra uploads (form field ``files``). Accepted for
            interface compatibility; this handler does not read them.

    Returns:
        * On success: a list of the non-blank lines printed by the generated
          analysis script.
        * On failure: ``{"error": <message>}``.
        * When the questions are not a scraping task:
          ``{"response": "This is a non-scraping task."}``.
    """
    raw_questions = await questions_file.read()
    # errors="replace" keeps a stray non-UTF-8 byte from turning the whole
    # request into an unhandled UnicodeDecodeError (HTTP 500).
    questions_text = raw_questions.decode("utf-8", errors="replace")

    lowered = questions_text.lower()
    if "scrape" not in lowered or "http" not in lowered:
        return {"response": "This is a non-scraping task."}

    # --- STEP 1: FETCH THE PAGE ---
    url = _find_url(questions_text)
    if not url:
        return {"error": "URL not found."}

    html_content = await tools.get_dynamic_html(url)
    # NOTE(review): this is a substring test, so any page whose HTML merely
    # contains the word "Error" is rejected too. Presumably
    # tools.get_dynamic_html reports failures as a string containing "Error";
    # confirm its exact format and tighten this check accordingly.
    if isinstance(html_content, str) and "Error" in html_content:
        return {"error": html_content}

    # --- STEP 2: LET THE LLM CHOOSE A TABLE ---
    choice_json_str = tools.choose_best_table_from_html(html_content, questions_text)
    try:
        choice = json.loads(choice_json_str)
        table_index = choice.get("index")
    except (TypeError, ValueError, AttributeError):
        # TypeError: reply is not str/bytes; ValueError: invalid JSON
        # (json.JSONDecodeError subclasses it); AttributeError: the JSON is
        # valid but not an object.
        return {"error": "Failed to decode LLM table choice."}
    if table_index is None:
        return {"error": "LLM failed to choose table."}

    # --- STEP 3: EXTRACT THE TABLE ---
    df = tools.extract_table_to_dataframe(html_content, table_index)
    if isinstance(df, str):
        # A string result is treated as an error message from the extractor.
        return {"error": df}

    # --- STEP 4: GENERATE & EXECUTE PYTHON CODE ---
    print("Step 4: Generating Python code for analysis.")
    df_head = df.head().to_string()
    df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"

    # Extract only the numbered questions to prevent the LLM from getting
    # distracted by the surrounding instructions.
    cleaned_questions_text = _select_numbered_questions(questions_text)
    print(f"--- Cleaned Questions for Code Gen ---\n{cleaned_questions_text}\n--------------------------------------")

    user_prompt = f"{df_info}\n\nAnswer these questions with a Python script:\n\n{cleaned_questions_text}"

    try:
        # NOTE(review): this call (and the tools.* calls above and below) is
        # synchronous inside an async handler, so it blocks the event loop
        # while it runs. Consider offloading if concurrent requests matter.
        completion = client.chat.completions.create(
            model="gpt-5-nano",
            messages=[
                {"role": "system", "content": _CODE_GEN_SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        response_content = completion.choices[0].message.content
        if response_content is None:
            # Report an empty reply explicitly rather than as an opaque
            # AttributeError on None.
            return {"error": "An error occurred during code generation or execution: the model returned no content."}

        python_code = _strip_code_fences(response_content)
        print(f"--- Generated Python Code ---\n{python_code}\n-----------------------------")

        print("Step 5: Executing generated code.")
        # SECURITY NOTE: the generated code is LLM output influenced by the
        # uploaded questions and the scraped page; how it is sandboxed depends
        # entirely on tools.run_python_code_on_dataframe.
        execution_result = tools.run_python_code_on_dataframe(df, python_code)

        final_answers = [line for line in execution_result.strip().split('\n') if line.strip()]
        return final_answers
    except Exception as e:
        # Top-level boundary: report any generation/execution failure to the
        # caller instead of letting it surface as an HTTP 500.
        return {"error": f"An error occurred during code generation or execution: {str(e)}"}