# --- Import Libraries ---
import warnings
warnings.filterwarnings("ignore")
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import openai
import os
import json
import re
# Import Langchain components
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
# --- Environment Setup ---
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY is not set; add it to your environment or a .env file.")
# Ensure the API key is set for libraries that rely on this convention
os.environ["OPENAI_API_KEY"] = api_key
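# The .env file (or environment) is expected to provide the key in this form
# (the value shown is a hypothetical placeholder, not a real key):
#   OPENAI_API_KEY=sk-<your-key>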
# --- Model and Problem Definition ---
# Define the code generation models to compare
models = [
    "Salesforce/codegen-350m-mono",
    "HuggingFaceTB/SmolLM-360M",
    "EleutherAI/gpt-neo-125M"
]
# Instantiate the evaluation ("judge") model: an OpenAI chat model (gpt-4o) with
# temperature 0 for deterministic scoring
model = ChatOpenAI(model_name="gpt-4o", temperature=0)
# Define a code generation problem for testing the evaluation framework
problem_statement = "Write a Python code that finds the longest word in a sentence."
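# For reference only: a straightforward correct solution to the problem above, so the
# generated code can be judged against a known-good answer. The name
# `reference_longest_word` is illustrative and is not used anywhere else in this script.
def reference_longest_word(sentence: str) -> str:
    # Split on whitespace and return the longest token by character count
    return max(sentence.split(), key=len)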
# --- Helper Functions ---
def extract_json_from_evaluation(evaluation_text):
    """
    Extracts the JSON object from the given evaluation text using a regular expression.

    Parameters:
        evaluation_text (str): The text containing the evaluation, including the JSON object.

    Returns:
        dict: The extracted JSON object as a dictionary, or None on failure.
    """
    # Find the JSON object enclosed in ```json ... ``` fences.
    # re.DOTALL is important so that '.' also matches newlines in multi-line JSON.
    match = re.search(r'```json\n?(.*?)\n?```', evaluation_text, re.DOTALL)
    if match:
        json_str = match.group(1).strip()
        try:
            # Parse the JSON string into a dictionary
            return json.loads(json_str)
        except json.JSONDecodeError:
            print("Error: Failed to decode JSON.")
            return None
    else:
        print("Error: No JSON object found in the text.")
        return None
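# Quick illustration of the extractor on a sample judge response; the sample text is
# made up and only demonstrates the ```json ... ``` format the function expects.
_sample_evaluation = 'Scores below:\n```json\n{"Correctness": 5, "Readability": 4}\n```'
assert extract_json_from_evaluation(_sample_evaluation) == {"Correctness": 5, "Readability": 4}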
def evaluate_code(question, code):
    """
    Evaluates generated Python code against the problem statement using an OpenAI
    chat model via LangChain. The judge returns scores on several criteria as JSON.

    Args:
        question (str): The coding problem statement.
        code (str): The generated Python code.

    Returns:
        dict: The extracted scores, or None if evaluation fails.
    """
    promptstr = f'''
You are a code reviewer who evaluates Python code against a given problem.
The coding problem is as follows: {question}
Evaluate the following Python code for correctness and quality against the given problem:
{code}
Provide scores on a scale of 1 to 5 for the following criteria:
1. Correctness: How correct is the code in terms of logic and output against the given problem?
2. Efficiency: How efficient is the solution in terms of execution?
3. Readability: How readable and well-structured is the code?
4. Best Practices: How well does the code follow coding best practices?
5. Comments: How well are the code and logic explained with comments?
Return only a JSON object with the criteria and the scores, enclosed in a ```json ... ``` block, and nothing else.
'''
    # Build a simple LangChain chain: a prompt template piped into the ChatOpenAI judge model.
    # The fully rendered prompt is passed as a single user message.
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful code reviewer that responds only with a JSON object enclosed in ```json ... ``` tags."),
        ("user", "{prompt_text}")
    ])
    chain = template | model
    response = chain.invoke({"prompt_text": promptstr})

    print("-" * 30 + " GENERATED EVALUATION " + "-" * 30)
    print(response.content.strip())
    print("-" * 80)

    # Extract the scores from the response
    scores = extract_json_from_evaluation(response.content.strip())
    return scores
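# For downstream use, the judge's response is expected to parse into a flat dict of
# criterion -> score, for example (illustrative values only):
# {"Correctness": 4, "Efficiency": 3, "Readability": 4, "Best Practices": 3, "Comments": 2}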
def visualize_scores(evaluation_results):
    """
    Visualizes the evaluation scores for different models using a grouped bar chart.

    Args:
        evaluation_results (list): A list of dictionaries, where each dict contains
                                   model results, including 'Scores'.
    """
    if not evaluation_results:
        print("No results to visualize.")
        return

    # Extract the criteria (assuming all models have the same set)
    criteria = list(evaluation_results[0]['Scores'].keys())
    num_criteria = len(criteria)
    num_models = len(evaluation_results)
    bar_width = 0.2

    # Generate a color map for different models
    colors = plt.cm.viridis(np.linspace(0, 1, num_models))

    # Set up the bar chart
    fig, ax = plt.subplots(figsize=(12, 6))

    # Generate bars for each model
    for i, result in enumerate(evaluation_results):
        # Extract scores in the order of criteria
        model_scores = [result['Scores'][c] for c in criteria]
        # Calculate bar positions:
        # np.arange(num_criteria) gives the base positions [0, 1, 2, ...],
        # and bar_width * i shifts the group of bars for the current model
        bar_positions = np.arange(num_criteria) + bar_width * i
        ax.bar(bar_positions, model_scores, bar_width,
               label=f'Model {i + 1} - {result.get("model_name", "Unknown")}',
               color=colors[i])

    # Set chart labels and title
    ax.set_xlabel('Evaluation Criteria')
    ax.set_ylabel('Scores (1 to 5)')
    ax.set_title('Evaluation Scores for Code Generation Models')

    # Set X-axis ticks to be centered under the groups of bars
    ax.set_xticks(np.arange(num_criteria) + bar_width * (num_models / 2 - 0.5))
    ax.set_xticklabels(criteria, rotation=45, ha='right')
    ax.legend()

    # Display the chart
    plt.tight_layout()
    plt.show()
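# Standalone usage sketch (kept commented out so the script only plots the real results
# below; the scores here are illustrative):
# visualize_scores([
#     {"model_name": "Salesforce/codegen-350m-mono",
#      "Scores": {"Correctness": 3, "Efficiency": 3, "Readability": 4,
#                 "Best Practices": 3, "Comments": 2}},
# ])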
# --- Main Evaluation Loop ---
print("Starting LLM Code Generation and Evaluation...")
results = []
for model_name in models:
    print("\n" + "=" * 80)
    print(f"Evaluating Model: {model_name}")
    print("=" * 80)

    # Load the text-generation pipeline for the current model
    # device=-1 indicates CPU (change to 0 or another index for a specific GPU)
    generator = pipeline("text-generation", model=model_name, device=-1)

    # Generate code, passing the problem statement directly as the prompt
    generated_code_output = generator(problem_statement, max_length=200, do_sample=False)

    # The output is typically a list of dicts: [{'generated_text': '...'}];
    # remove the echoed prompt so only the generated code remains
    generated_code = generated_code_output[0]['generated_text'].replace(problem_statement, "").strip()

    print("-" * 30 + " GENERATED CODE " + "-" * 30)
    print(f"\n{generated_code}\n")
    print("-" * 76)

    # Evaluate code
    evaluation_scores = evaluate_code(problem_statement, generated_code)

    # Append the result
    if evaluation_scores:
        results.append({
            "model_name": model_name,
            "Scores": evaluation_scores
        })
    else:
        print(f"Skipping model {model_name} due to failed evaluation.")
# --- Visualization of Evaluation Results ---
print("\n" + "=" * 80)
print("Final Evaluation Results:")
print("=" * 80)
print(results)
# Visualize the scores
if results:
    visualize_scores(results)
else:
    print("No valid results to visualize.")