KarthikMuraliM committed on
Commit
98933b3
·
1 Parent(s): 4fda5be

Feat: Add data visualization capabilities

Browse files
Files changed (3) hide show
  1. app.py +38 -12
  2. requirements.txt +4 -1
  3. tools.py +25 -8
app.py CHANGED
@@ -75,20 +75,46 @@ async def analyze_data(
75
  df_head = df.head().to_string()
76
  df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
77
 
78
- system_prompt = """
79
  You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
80
 
81
- **URGENT AND CRITICAL INSTRUCTIONS:**
82
- - The pandas DataFrame `df` is ALREADY in memory.
83
- - The pandas library is ALREADY imported as `pd`.
84
- - The regex library is ALREADY imported as `re`.
85
- - DO NOT include any `import` statements in your code.
86
- - DO NOT write any code to read or load data.
87
- - Your entire output must be ONLY the raw Python code. No markdown, no comments, no explanations.
88
-
89
- **Your script MUST:**
90
- 1. Perform data cleaning on the `df` DataFrame first.
91
- 2. For EACH question the user asks, you MUST `print()` the final answer.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  """
93
  user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
94
 
 
75
  df_head = df.head().to_string()
76
  df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
77
 
78
+ system_prompt ="""
79
  You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
80
 
81
+ **AVAILABLE LIBRARIES:**
82
+ The following libraries are ALREADY IMPORTED and available for you to use:
83
+ - `pandas` as `pd`
84
+ - `re`
85
+ - `matplotlib.pyplot` as `plt`
86
+ - `seaborn` as `sns`
87
+ - `numpy` as `np`
88
+ - `io`
89
+ - `base64`
90
+ - `sklearn.linear_model.LinearRegression`
91
+
92
+ **CRITICAL INSTRUCTIONS:**
93
+ - DO NOT include any `import` statements.
94
+ - The DataFrame `df` is ALREADY in memory. DO NOT load data.
95
+ - Your entire output MUST BE ONLY raw Python code. No markdown or explanations.
96
+
97
+ **YOUR SCRIPT MUST:**
98
+ 1. First, perform data cleaning on the `df` DataFrame.
99
+ 2. For any text-based or calculation questions, `print()` the final answer.
100
+ 3. If asked to draw a plot, you MUST generate the plot and print it as a base64 encoded data URI. DO NOT show the plot. Follow this EXACT recipe:
101
+ ```python
102
+ # --- START PLOT RECIPE ---
103
+ fig, ax = plt.subplots()
104
+ # ... your plotting code using 'ax', e.g., sns.scatterplot(ax=ax, ...) ...
105
+
106
+ # Save the plot to an in-memory buffer
107
+ buf = io.BytesIO()
108
+ fig.savefig(buf, format='png', bbox_inches='tight')
109
+ buf.seek(0)
110
+
111
+ # Encode the buffer to a base64 string
112
+ image_base64 = base64.b64encode(buf.read()).decode('utf-8')
113
+
114
+ # Print the data URI
115
+ print(f"data:image/png;base64,{image_base64}")
116
+ # --- END PLOT RECIPE ---
117
+ ```
118
  """
119
  user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
120
 
requirements.txt CHANGED
@@ -7,4 +7,7 @@ beautifulsoup4 # Add this line for parsing HTML
7
  lxml # Add this line, it's a fast parser for BeautifulSoup
8
  pandas
9
  numpy
10
- playwright
 
 
 
 
7
  lxml # Add this line, it's a fast parser for BeautifulSoup
8
  pandas
9
  numpy
10
+ playwright
11
+ matplotlib # For plotting
12
+ seaborn # For statistical plots
13
+ scikit-learn # For regression calculations
tools.py CHANGED
@@ -9,6 +9,17 @@ import re
9
  import io
10
  import sys
11
  from contextlib import redirect_stdout
 
 
 
 
 
 
 
 
 
 
 
12
  client = None
13
  def set_openai_client(c):
14
  global client
@@ -90,30 +101,36 @@ def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataF
90
 
91
  def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
92
  """
93
- Executes Python code with a DataFrame named 'df' available in the local scope.
94
  Captures and returns any output printed to stdout.
95
  """
96
- # Create a string stream to capture stdout
97
  output_stream = io.StringIO()
98
 
99
- # Create a local scope for the exec to run in, with 'df' pre-populated
100
  local_scope = {
101
  'df': df,
102
  'pd': pd,
103
- 're': re
 
 
 
 
 
 
104
  }
105
 
106
  try:
107
- # Redirect stdout to our stream
108
  with redirect_stdout(output_stream):
109
- # Execute the code in the defined scope
110
  exec(python_code, {'__builtins__': __builtins__}, local_scope)
111
 
112
- # Get the captured output
 
 
113
  result = output_stream.getvalue()
114
  if not result:
115
  return "Code executed successfully with no printed output."
116
  return result
117
 
118
  except Exception as e:
119
- return f"Error executing code: {e}\n---\nCode that failed:\n{python_code}"
 
 
9
  import io
10
  import sys
11
  from contextlib import redirect_stdout
12
+
13
+
14
+ import matplotlib.pyplot as plt
15
+ import seaborn as sns
16
+ from sklearn.linear_model import LinearRegression
17
+ import numpy as np
18
+ import base64
19
+
20
+
21
+
22
+
23
  client = None
24
  def set_openai_client(c):
25
  global client
 
101
 
102
  def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
103
  """
104
+ Executes Python code with a DataFrame and common libraries available.
105
  Captures and returns any output printed to stdout.
106
  """
 
107
  output_stream = io.StringIO()
108
 
109
+ # --- ADD THE NEW LIBRARIES TO THE SCOPE ---
110
  local_scope = {
111
  'df': df,
112
  'pd': pd,
113
+ 're': re,
114
+ 'plt': plt, # Matplotlib
115
+ 'sns': sns, # Seaborn
116
+ 'np': np, # NumPy
117
+ 'LinearRegression': LinearRegression, # Scikit-learn
118
+ 'io': io, # IO for in-memory files
119
+ 'base64': base64 # Base64 for encoding
120
  }
121
 
122
  try:
 
123
  with redirect_stdout(output_stream):
 
124
  exec(python_code, {'__builtins__': __builtins__}, local_scope)
125
 
126
+ # After execution, close any open matplotlib plots to free up memory
127
+ plt.close('all')
128
+
129
  result = output_stream.getvalue()
130
  if not result:
131
  return "Code executed successfully with no printed output."
132
  return result
133
 
134
  except Exception as e:
135
+ plt.close('all') # Also close plots on error
136
+ return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"