Spaces:

Karthix1
/

data-analyst-agent

Sleeping

App Files Files Community

KarthikMuraliM committed on Sep 4

Commit

98933b3

1 Parent(s): 4fda5be

Feat: Add data visualization capabilities

Browse files

Files changed (3) hide show

app.py +38 -12
requirements.txt +4 -1
tools.py +25 -8

app.py CHANGED Viewed

@@ -75,20 +75,46 @@ async def analyze_data(
         df_head = df.head().to_string()
         df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
-        system_prompt = """
         You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
-        **URGENT AND CRITICAL INSTRUCTIONS:**
-        - The pandas DataFrame `df` is ALREADY in memory.
-        - The pandas library is ALREADY imported as `pd`.
-        - The regex library is ALREADY imported as `re`.
-        - DO NOT include any `import` statements in your code.
-        - DO NOT write any code to read or load data.
-        - Your entire output must be ONLY the raw Python code. No markdown, no comments, no explanations.
-        **Your script MUST:**
-        1.  Perform data cleaning on the `df` DataFrame first.
-        2.  For EACH question the user asks, you MUST `print()` the final answer.
         """
         user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"

         df_head = df.head().to_string()
         df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
+        system_prompt ="""
         You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
+        **AVAILABLE LIBRARIES:**
+        The following libraries are ALREADY IMPORTED and available for you to use:
+        - `pandas` as `pd`
+        - `re`
+        - `matplotlib.pyplot` as `plt`
+        - `seaborn` as `sns`
+        - `numpy` as `np`
+        - `io`
+        - `base64`
+        - `sklearn.linear_model.LinearRegression`
+        **CRITICAL INSTRUCTIONS:**
+        - DO NOT include any `import` statements.
+        - The DataFrame `df` is ALREADY in memory. DO NOT load data.
+        - Your entire output MUST BE ONLY raw Python code. No markdown or explanations.
+        **YOUR SCRIPT MUST:**
+        1.  First, perform data cleaning on the `df` DataFrame.
+        2.  For any text-based or calculation questions, `print()` the final answer.
+        3.  If asked to draw a plot, you MUST generate the plot and print it as a base64 encoded data URI. DO NOT show the plot. Follow this EXACT recipe:
+            ```python
+            # --- START PLOT RECIPE ---
+            fig, ax = plt.subplots()
+            # ... your plotting code using 'ax', e.g., sns.scatterplot(ax=ax, ...) ...
+            # Save the plot to an in-memory buffer
+            buf = io.BytesIO()
+            fig.savefig(buf, format='png', bbox_inches='tight')
+            buf.seek(0)
+            # Encode the buffer to a base64 string
+            image_base64 = base64.b64encode(buf.read()).decode('utf-8')
+            # Print the data URI
+            print(f"data:image/png;base64,{image_base64}")
+            # --- END PLOT RECIPE ---
+            ```
         """
         user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"

requirements.txt CHANGED Viewed

@@ -7,4 +7,7 @@ beautifulsoup4    # Add this line for parsing HTML
 lxml              # Add this line, it's a fast parser for BeautifulSoup
 pandas
 numpy
-playwright

 lxml              # Add this line, it's a fast parser for BeautifulSoup
 pandas
 numpy
+playwright
+matplotlib  # For plotting
+seaborn     # For statistical plots
+scikit-learn # For regression calculations

tools.py CHANGED Viewed

@@ -9,6 +9,17 @@ import re
 import io
 import sys
 from contextlib import redirect_stdout
 client = None
 def set_openai_client(c):
     global client
@@ -90,30 +101,36 @@ def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataF
 def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
     """
-    Executes Python code with a DataFrame named 'df' available in the local scope.
     Captures and returns any output printed to stdout.
     """
-    # Create a string stream to capture stdout
     output_stream = io.StringIO()
-    # Create a local scope for the exec to run in, with 'df' pre-populated
     local_scope = {
         'df': df,
         'pd': pd,
-        're': re
     }
     try:
-        # Redirect stdout to our stream
         with redirect_stdout(output_stream):
-            # Execute the code in the defined scope
             exec(python_code, {'__builtins__': __builtins__}, local_scope)
-        # Get the captured output
         result = output_stream.getvalue()
         if not result:
             return "Code executed successfully with no printed output."
         return result
     except Exception as e:
-        return f"Error executing code: {e}\n---\nCode that failed:\n{python_code}"

 import io
 import sys
 from contextlib import redirect_stdout
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.linear_model import LinearRegression
+import numpy as np
+import base64
 client = None
 def set_openai_client(c):
     global client
 def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
     """
     Executes Python code with a DataFrame and common libraries available.
     Captures and returns any output printed to stdout.

     Args:
         df: DataFrame exposed to the executed code under the name `df`.
         python_code: Source text of the script to execute.

     Returns:
         Everything the script printed to stdout; a fixed notice when the
         script printed nothing; or an error description (exception type,
         message and the offending code) when the script raised.
     """
     output_stream = io.StringIO()
     # Use ONE namespace dict as the globals of the executed code. Passing
     # separate globals/locals dicts makes exec() run the script as if it were
     # a class body, so functions, lambdas and comprehensions defined by the
     # script could not see `df`, `pd`, ... (or the script's own top-level
     # variables) and would fail with NameError.
     namespace = {
         '__builtins__': __builtins__,
         'df': df,
         'pd': pd,
         're': re,
         'plt': plt,          # Matplotlib
         'sns': sns,          # Seaborn
         'np': np,            # NumPy
         'LinearRegression': LinearRegression,  # Scikit-learn
         'io': io,            # IO for in-memory files
         'base64': base64,    # Base64 for encoding
     }
     try:
         with redirect_stdout(output_stream):
             # NOTE(security): this executes generated code with full builtins
             # in-process; it is not a sandbox. Run it in an isolated
             # process/container if the input cannot be trusted.
             exec(python_code, namespace)
     except Exception as e:
         return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"
     finally:
         # Close any figures the script left open, on success and on error
         # alike, to free memory.
         plt.close('all')
     result = output_stream.getvalue()
     if not result:
         return "Code executed successfully with no printed output."
     return result