KarthikMuraliM committed on
Commit
bba9124
·
1 Parent(s): aa9714c

Final Fix: Clean the prompt sent to the code generator

Browse files
Files changed (1) hide show
  1. app.py +34 -67
app.py CHANGED
@@ -4,6 +4,7 @@ from typing import List
4
  import openai
5
  import json
6
  import pandas as pd
 
7
 
8
  # Import our agent's tools
9
  import tools
@@ -28,105 +29,71 @@ async def analyze_data(
28
  questions_file: UploadFile = File(..., alias="questions.txt"),
29
  files: List[UploadFile] = File([], alias="files"),
30
  ):
31
- """
32
- Main endpoint to handle data analysis tasks. It orchestrates scraping,
33
- data extraction, code generation, and code execution.
34
- """
35
  questions_text = (await questions_file.read()).decode("utf-8")
36
 
37
- # Simple router: Check if the task involves scraping a URL.
38
  if "scrape" in questions_text.lower() and "http" in questions_text.lower():
39
-
40
- # --- AGENT WORKFLOW ---
41
-
42
- # Step 1: PERCEIVE - Get the fully rendered HTML from the URL using Playwright
43
- print("Step 1: Fetching dynamic HTML from URL...")
44
  url = next((word for word in questions_text.split() if word.startswith("http")), None)
45
- if not url:
46
- return {"error": "Scraping task detected, but no URL was found."}
47
-
48
  html_content = await tools.get_dynamic_html(url)
49
- if isinstance(html_content, str) and "Error" in html_content:
50
- return {"error": html_content}
51
-
52
- # Step 2: DECIDE - Ask the LLM to identify the best table to use for the task
53
- print("Step 2: Asking LLM to choose the best table index...")
54
  choice_json_str = tools.choose_best_table_from_html(html_content, questions_text)
55
  try:
56
  choice = json.loads(choice_json_str)
57
- if "error" in choice:
58
- return {"error": choice["error"]}
59
  table_index = choice.get("index")
60
- if table_index is None or not isinstance(table_index, int):
61
- return {"error": "LLM failed to return a valid integer index for the table."}
62
- except (json.JSONDecodeError, TypeError):
63
- return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
64
-
65
- # Step 3: ACT (Extraction) - Extract the chosen table into a pandas DataFrame
66
- print(f"Step 3: Extracting table with index '{table_index}'...")
67
  df = tools.extract_table_to_dataframe(html_content, table_index)
68
- if isinstance(df, str):
69
- return {"error": df}
70
 
71
  # --- STEP 4: GENERATE & EXECUTE PYTHON CODE ---
72
- print("Step 4: Generating Python code for analysis...")
73
 
74
- # Prepare a concise summary of the DataFrame for the LLM prompt
75
  df_head = df.head().to_string()
76
  df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
77
 
78
- # system_prompt = """
79
- # You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
80
-
81
- # **URGENT AND CRITICAL INSTRUCTION:**
82
- # DO NOT write any code to read or load data (e.g., from a URL or file). The DataFrame `df` is ALREADY in memory. Start your script as if `df` is already defined.
 
 
 
 
 
83
 
84
- # **Your script MUST:**
85
- # 1. Perform data cleaning on the `df` DataFrame. Columns that look like numbers may be strings with '$' or ',' symbols.
86
- # 2. For EACH question the user asks, you MUST `print()` the final answer.
87
- # 3. Your entire output must be ONLY the raw Python code. No markdown, no comments, no explanations.
88
- # """
89
  system_prompt = """
90
- You are a Python script generator. Your only output is code.
91
- A pandas DataFrame named `df` and the following libraries are pre-loaded: `pd`, `re`, `plt`, `sns`, `np`, `io`, `base64`, `LinearRegression`.
92
-
93
- **CRITICAL:**
94
- - DO NOT import any libraries.
95
- - DO NOT load any data.
96
- - Write a script that cleans the `df` DataFrame and then prints the answers to the user's questions.
97
- - For plots, print a base64 data URI using the provided recipe.
 
 
98
  """
99
-
100
- user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
101
 
102
  try:
103
- # Generate the Python code using the LLM
104
- completion = client.chat.completions.create(
105
- model="gpt-5-nano",
106
- messages=[
107
- {"role": "system", "content": system_prompt},
108
- {"role": "user", "content": user_prompt}
109
- ]
110
- )
111
  response_content = completion.choices[0].message.content
112
-
113
- # Extract the code from the markdown block (e.g., ```python\n...\n```)
114
  python_code = response_content.strip().replace("```python", "").replace("```", "").strip()
115
 
116
- # Step 5: ACT (Execution) - Run the generated code using our tool
117
  print(f"--- Generated Python Code ---\n{python_code}\n-----------------------------")
118
-
119
  print("Step 5: Executing generated code.")
120
  execution_result = tools.run_python_code_on_dataframe(df, python_code)
121
 
122
- # The result is the captured print output. Format it into a JSON array of strings.
123
  final_answers = [line for line in execution_result.strip().split('\n') if line.strip()]
124
-
125
  return final_answers
126
 
127
  except Exception as e:
128
  return {"error": f"An error occurred during code generation or execution: {str(e)}"}
129
 
130
  else:
131
- # Handle non-scraping, general knowledge tasks
132
- return {"response": "This is a non-scraping task."}
 
4
  import openai
5
  import json
6
  import pandas as pd
7
+ import re
8
 
9
  # Import our agent's tools
10
  import tools
 
29
  questions_file: UploadFile = File(..., alias="questions.txt"),
30
  files: List[UploadFile] = File([], alias="files"),
31
  ):
 
 
 
 
32
  questions_text = (await questions_file.read()).decode("utf-8")
33
 
 
34
  if "scrape" in questions_text.lower() and "http" in questions_text.lower():
35
+ # ... (Steps 1, 2, and 3 are the same: get html, choose table, extract df) ...
 
 
 
 
36
  url = next((word for word in questions_text.split() if word.startswith("http")), None)
37
+ if not url: return {"error": "URL not found."}
 
 
38
  html_content = await tools.get_dynamic_html(url)
39
+ if isinstance(html_content, str) and "Error" in html_content: return {"error": html_content}
 
 
 
 
40
  choice_json_str = tools.choose_best_table_from_html(html_content, questions_text)
41
  try:
42
  choice = json.loads(choice_json_str)
 
 
43
  table_index = choice.get("index")
44
+ if table_index is None: return {"error": "LLM failed to choose table."}
45
+ except: return {"error": "Failed to decode LLM table choice."}
 
 
 
 
 
46
  df = tools.extract_table_to_dataframe(html_content, table_index)
47
+ if isinstance(df, str): return {"error": df}
 
48
 
49
  # --- STEP 4: GENERATE & EXECUTE PYTHON CODE ---
50
+ print("Step 4: Generating Python code for analysis.")
51
 
 
52
  df_head = df.head().to_string()
53
  df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
54
 
55
+ # --- THIS IS THE CRITICAL FIX ---
56
+ # Extract only the numbered questions to prevent the LLM from getting distracted.
57
+ analysis_questions = re.findall(r"^\d+\.\s.*", questions_text, re.MULTILINE)
58
+ cleaned_questions_text = "\n".join(analysis_questions)
59
+ if not cleaned_questions_text:
60
+ # Fallback if no numbered questions are found
61
+ cleaned_questions_text = questions_text
62
+
63
+ print(f"--- Cleaned Questions for Code Gen ---\n{cleaned_questions_text}\n--------------------------------------")
64
+ # --- END OF FIX ---
65
 
66
+ # Final, simplified system prompt
 
 
 
 
67
  system_prompt = """
68
+ You are an expert Python data analyst. Your only job is to write a Python script.
69
+ A pandas DataFrame `df` and libraries `pd`, `re`, `plt`, `sns`, `np`, `io`, `base64`, `LinearRegression` are pre-loaded.
70
+
71
+ CRITICAL:
72
+ - DO NOT import libraries.
73
+ - DO NOT load data.
74
+ - Your output must be ONLY raw Python code.
75
+ - Clean the `df` DataFrame.
76
+ - For each question, `print()` the answer.
77
+ - For plots, print a base64 data URI.
78
  """
79
+
80
+ user_prompt = f"{df_info}\n\nAnswer these questions with a Python script:\n\n{cleaned_questions_text}"
81
 
82
  try:
83
+ completion = client.chat.completions.create(model="gpt-5-nano", messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
 
 
 
 
 
 
 
84
  response_content = completion.choices[0].message.content
 
 
85
  python_code = response_content.strip().replace("```python", "").replace("```", "").strip()
86
 
 
87
  print(f"--- Generated Python Code ---\n{python_code}\n-----------------------------")
88
+
89
  print("Step 5: Executing generated code.")
90
  execution_result = tools.run_python_code_on_dataframe(df, python_code)
91
 
 
92
  final_answers = [line for line in execution_result.strip().split('\n') if line.strip()]
 
93
  return final_answers
94
 
95
  except Exception as e:
96
  return {"error": f"An error occurred during code generation or execution: {str(e)}"}
97
 
98
  else:
99
+ return {"response": "This is a non-scraping task."}