KarthikMuraliM committed on
Commit
e9b9efe
·
1 Parent(s): 8bcc812

Feat: Add Code Interpreter for reliable data analysis and more debugging

Browse files
Files changed (2) hide show
  1. app.py +29 -10
  2. tools.py +1 -1
app.py CHANGED
@@ -76,16 +76,33 @@ async def analyze_data(
76
  df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
77
 
78
  system_prompt = """
79
- You are an expert Python data analyst. You are given a description of a pandas DataFrame named 'df' and a set of questions.
80
- Your task is to write a single Python script to answer these questions.
81
-
82
- Guidelines:
83
- 1. The DataFrame 'df' is already loaded and available in your environment.
84
- 2. First, you MUST perform data cleaning. Pay close attention to columns with symbols like '$', ',', or text that needs to be converted to numbers. Use `pd.to_numeric` and string manipulation (`.str.replace()`). Handle potential errors during conversion by using `errors='coerce'`.
85
- 3. Address each question from the user clearly.
86
- 4. Use the `print()` function to output the final answer for each question. Start each print statement with a clear label (e.g., "Answer 1:", "Answer 2:").
87
- 5. Do not include any example usage, comments, or explanations outside of the Python code block.
88
- 6. The final output of your script should be ONLY the Python code itself.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  """
90
  user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
91
 
@@ -104,6 +121,8 @@ async def analyze_data(
104
  python_code = response_content.strip().replace("```python", "").replace("```", "").strip()
105
 
106
  # Step 5: ACT (Execution) - Run the generated code using our tool
 
 
107
  print("Step 5: Executing generated code.")
108
  execution_result = tools.run_python_code_on_dataframe(df, python_code)
109
 
 
76
  df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
77
 
78
  system_prompt = """
79
+ You are a world-class Python data analyst. You will be given the head of a pandas DataFrame named 'df' and a set of questions.
80
+ Your ONLY job is to write a Python script to answer the questions.
81
+
82
+ **CRITICAL RULES:**
83
+ 1. The DataFrame `df` is already loaded in memory. Do NOT load the data.
84
+ 2. You MUST perform data cleaning first. The data is messy. Columns with numbers might be strings with symbols like '$' or ','. Use `df['col'].replace(...)` and `pd.to_numeric(..., errors='coerce')`.
85
+ 3. For EACH question, you MUST write code to calculate the answer and then immediately print the answer to the console using the `print()` function.
86
+ 4. Each print statement MUST be clear and self-contained.
87
+ 5. Your final output MUST ONLY BE THE PYTHON CODE, with no explanations, comments, or markdown.
88
+
89
+ **EXAMPLE OF A PERFECT SCRIPT:**
90
+ ```python
91
+ import pandas as pd
92
+
93
+ # Data Cleaning
94
+ df['Worldwide gross'] = df['Worldwide gross'].replace({r'\\$': '', r',': ''}, regex=True)
95
+ df['Worldwide gross'] = pd.to_numeric(df['Worldwide gross'], errors='coerce')
96
+ df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
97
+
98
+ # Question 1: How many movies grossed over $2.5B?
99
+ movies_over_2_5bn = df[df['Worldwide gross'] > 2500000000].shape[0]
100
+ print(f"Movies over $2.5B: {movies_over_2_5bn}")
101
+
102
+ # Question 2: What is the average gross of movies released in 2019?
103
+ avg_gross_2019 = df[df['Year'] == 2019]['Worldwide gross'].mean()
104
+ print(f"Average gross for 2019 movies: ${avg_gross_2019:,.2f}")
105
+ ```
106
  """
107
  user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
108
 
 
121
  python_code = response_content.strip().replace("```python", "").replace("```", "").strip()
122
 
123
  # Step 5: ACT (Execution) - Run the generated code using our tool
124
+ print(f"--- Generated Python Code ---\n{python_code}\n-----------------------------")
125
+
126
  print("Step 5: Executing generated code.")
127
  execution_result = tools.run_python_code_on_dataframe(df, python_code)
128
 
tools.py CHANGED
@@ -79,7 +79,7 @@ def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataF
79
  selected_table = tables[table_index]
80
 
81
  try:
82
- df_list = pd.read_html(str(selected_table))
83
  if not df_list:
84
  return "Error: Pandas could not parse the selected table."
85
  return df_list[0]
 
79
  selected_table = tables[table_index]
80
 
81
  try:
82
+ df_list = pd.read_html(io.StringIO(str(selected_table)))
83
  if not df_list:
84
  return "Error: Pandas could not parse the selected table."
85
  return df_list[0]