Spaces:
Sleeping
Sleeping
Commit · 98933b3
1 Parent(s): 4fda5be
Feat: Add data visualization capabilities
Browse files
- app.py +38 -12
- requirements.txt +4 -1
- tools.py +25 -8
app.py
CHANGED
|
@@ -75,20 +75,46 @@ async def analyze_data(
|
|
| 75 |
df_head = df.head().to_string()
|
| 76 |
df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
|
| 77 |
|
| 78 |
-
system_prompt =
|
| 79 |
You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
|
| 80 |
|
| 81 |
-
**
|
| 82 |
-
|
| 83 |
-
-
|
| 84 |
-
-
|
| 85 |
-
-
|
| 86 |
-
-
|
| 87 |
-
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
"""
|
| 93 |
user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
|
| 94 |
|
|
|
|
| 75 |
df_head = df.head().to_string()
|
| 76 |
df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
|
| 77 |
|
| 78 |
+
system_prompt ="""
|
| 79 |
You are an AI data analyst. Your ONLY task is to write a Python script that operates on a pre-existing pandas DataFrame named `df`.
|
| 80 |
|
| 81 |
+
**AVAILABLE LIBRARIES:**
|
| 82 |
+
The following libraries are ALREADY IMPORTED and available for you to use:
|
| 83 |
+
- `pandas` as `pd`
|
| 84 |
+
- `re`
|
| 85 |
+
- `matplotlib.pyplot` as `plt`
|
| 86 |
+
- `seaborn` as `sns`
|
| 87 |
+
- `numpy` as `np`
|
| 88 |
+
- `io`
|
| 89 |
+
- `base64`
|
| 90 |
+
- `sklearn.linear_model.LinearRegression`
|
| 91 |
+
|
| 92 |
+
**CRITICAL INSTRUCTIONS:**
|
| 93 |
+
- DO NOT include any `import` statements.
|
| 94 |
+
- The DataFrame `df` is ALREADY in memory. DO NOT load data.
|
| 95 |
+
- Your entire output MUST BE ONLY raw Python code. No markdown or explanations.
|
| 96 |
+
|
| 97 |
+
**YOUR SCRIPT MUST:**
|
| 98 |
+
1. First, perform data cleaning on the `df` DataFrame.
|
| 99 |
+
2. For any text-based or calculation questions, `print()` the final answer.
|
| 100 |
+
3. If asked to draw a plot, you MUST generate the plot and print it as a base64 encoded data URI. DO NOT show the plot. Follow this EXACT recipe:
|
| 101 |
+
```python
|
| 102 |
+
# --- START PLOT RECIPE ---
|
| 103 |
+
fig, ax = plt.subplots()
|
| 104 |
+
# ... your plotting code using 'ax', e.g., sns.scatterplot(ax=ax, ...) ...
|
| 105 |
+
|
| 106 |
+
# Save the plot to an in-memory buffer
|
| 107 |
+
buf = io.BytesIO()
|
| 108 |
+
fig.savefig(buf, format='png', bbox_inches='tight')
|
| 109 |
+
buf.seek(0)
|
| 110 |
+
|
| 111 |
+
# Encode the buffer to a base64 string
|
| 112 |
+
image_base64 = base64.b64encode(buf.read()).decode('utf-8')
|
| 113 |
+
|
| 114 |
+
# Print the data URI
|
| 115 |
+
print(f"data:image/png;base64,{image_base64}")
|
| 116 |
+
# --- END PLOT RECIPE ---
|
| 117 |
+
```
|
| 118 |
"""
|
| 119 |
user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
|
| 120 |
|
requirements.txt
CHANGED
|
@@ -7,4 +7,7 @@ beautifulsoup4 # Add this line for parsing HTML
|
|
| 7 |
lxml # Add this line, it's a fast parser for BeautifulSoup
|
| 8 |
pandas
|
| 9 |
numpy
|
| 10 |
-
playwright
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
lxml # Add this line, it's a fast parser for BeautifulSoup
|
| 8 |
pandas
|
| 9 |
numpy
|
| 10 |
+
playwright
|
| 11 |
+
matplotlib # For plotting
|
| 12 |
+
seaborn # For statistical plots
|
| 13 |
+
scikit-learn # For regression calculations
|
tools.py
CHANGED
|
@@ -9,6 +9,17 @@ import re
|
|
| 9 |
import io
|
| 10 |
import sys
|
| 11 |
from contextlib import redirect_stdout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
client = None
|
| 13 |
def set_openai_client(c):
|
| 14 |
global client
|
|
@@ -90,30 +101,36 @@ def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataF
|
|
| 90 |
|
| 91 |
def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
|
| 92 |
"""
|
| 93 |
-
Executes Python code with a DataFrame
|
| 94 |
Captures and returns any output printed to stdout.
|
| 95 |
"""
|
| 96 |
-
# Create a string stream to capture stdout
|
| 97 |
output_stream = io.StringIO()
|
| 98 |
|
| 99 |
-
#
|
| 100 |
local_scope = {
|
| 101 |
'df': df,
|
| 102 |
'pd': pd,
|
| 103 |
-
're': re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
}
|
| 105 |
|
| 106 |
try:
|
| 107 |
-
# Redirect stdout to our stream
|
| 108 |
with redirect_stdout(output_stream):
|
| 109 |
-
# Execute the code in the defined scope
|
| 110 |
exec(python_code, {'__builtins__': __builtins__}, local_scope)
|
| 111 |
|
| 112 |
-
#
|
|
|
|
|
|
|
| 113 |
result = output_stream.getvalue()
|
| 114 |
if not result:
|
| 115 |
return "Code executed successfully with no printed output."
|
| 116 |
return result
|
| 117 |
|
| 118 |
except Exception as e:
|
| 119 |
-
|
|
|
|
|
|
| 9 |
import io
|
| 10 |
import sys
|
| 11 |
from contextlib import redirect_stdout
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
import matplotlib.pyplot as plt
|
| 15 |
+
import seaborn as sns
|
| 16 |
+
from sklearn.linear_model import LinearRegression
|
| 17 |
+
import numpy as np
|
| 18 |
+
import base64
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
client = None
|
| 24 |
def set_openai_client(c):
|
| 25 |
global client
|
|
|
|
| 101 |
|
| 102 |
def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
|
| 103 |
"""
|
| 104 |
+
Executes Python code with a DataFrame and common libraries available.
|
| 105 |
Captures and returns any output printed to stdout.
|
| 106 |
"""
|
|
|
|
| 107 |
output_stream = io.StringIO()
|
| 108 |
|
| 109 |
+
# --- ADD THE NEW LIBRARIES TO THE SCOPE ---
|
| 110 |
local_scope = {
|
| 111 |
'df': df,
|
| 112 |
'pd': pd,
|
| 113 |
+
're': re,
|
| 114 |
+
'plt': plt, # Matplotlib
|
| 115 |
+
'sns': sns, # Seaborn
|
| 116 |
+
'np': np, # NumPy
|
| 117 |
+
'LinearRegression': LinearRegression, # Scikit-learn
|
| 118 |
+
'io': io, # IO for in-memory files
|
| 119 |
+
'base64': base64 # Base64 for encoding
|
| 120 |
}
|
| 121 |
|
| 122 |
try:
|
|
|
|
| 123 |
with redirect_stdout(output_stream):
|
|
|
|
| 124 |
exec(python_code, {'__builtins__': __builtins__}, local_scope)
|
| 125 |
|
| 126 |
+
# After execution, close any open matplotlib plots to free up memory
|
| 127 |
+
plt.close('all')
|
| 128 |
+
|
| 129 |
result = output_stream.getvalue()
|
| 130 |
if not result:
|
| 131 |
return "Code executed successfully with no printed output."
|
| 132 |
return result
|
| 133 |
|
| 134 |
except Exception as e:
|
| 135 |
+
plt.close('all') # Also close plots on error
|
| 136 |
+
return f"Error executing code: {type(e).__name__}: {e}\n---\nCode that failed:\n{python_code}"
|