# --- Import Libraries ---
import warnings

warnings.filterwarnings("ignore")

import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
from transformers import pipeline

# Import LangChain components
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# --- Environment Setup ---
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise EnvironmentError("OPENAI_API_KEY is not set. Add it to your environment or a .env file.")

# Ensure the API key is set for libraries that rely on this convention
os.environ["OPENAI_API_KEY"] = api_key

# --- Model and Problem Definition ---
# Define the code generation models to compare
models = [
    "Salesforce/codegen-350m-mono",
    "HuggingFaceTB/SmolLM-360M",
    "EleutherAI/gpt-neo-125M"
]

# Create the evaluator: an OpenAI chat model (e.g., gpt-4o) used as the judge
evaluator_model = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Define a code generation problem for testing our evaluation framework
problem_statement = "Write a Python code that finds the longest word in a sentence."


# --- Helper Functions ---
def extract_json_from_evaluation(evaluation_text):
    """
    Extracts the JSON object from the given evaluation text using a regular expression.

    Parameters:
        evaluation_text (str): The text containing the evaluation, including the JSON object.

    Returns:
        dict: The extracted JSON object as a dictionary, or None on failure.
    """
    # Find the JSON object enclosed in ```json ... ``` fences.
    # re.DOTALL is important so that '.' also matches newlines.
    match = re.search(r'```json\n?(.*?)\n?```', evaluation_text, re.DOTALL)
    if match:
        json_str = match.group(1).strip()
        try:
            # Parse the JSON string into a dictionary
            return json.loads(json_str)
        except json.JSONDecodeError:
            print("Error: Failed to decode JSON.")
            return None
    else:
        print("Error: No JSON object found in the text.")
        return None


def evaluate_code(question, code):
    """
    Evaluates generated code using an OpenAI chat model as the judge.
    The evaluation model provides scores on several criteria in JSON format.

    Args:
        question (str): The coding problem statement.
        code (str): The generated Python code.

    Returns:
        dict: The extracted scores, or None if evaluation fails.
    """
    promptstr = f'''
You are a code reviewer who evaluates a given Python code against a given problem.

The coding problem is as follows:
{question}

Evaluate the following Python code for correctness and quality against the given problem:
{code}

Provide scores on a scale of 1 to 5 for the following criteria:
1. Correctness: How correct is the code in terms of logic and output against the given problem?
2. Efficiency: How efficient is the solution in terms of execution?
3. Readability: How readable and well-structured is the code?
4. Best Practices: How well does the code follow coding best practices?
5. Comments: How well are the code and logic explained with comments?

Return only a JSON object with the criteria and the scores, enclosed in ```json ... ``` tags, and nothing else.
'''

    # Build a simple prompt template and chain it to the evaluator model with LangChain
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful code reviewer that responds only with a JSON object enclosed in ```json ... ``` tags."),
        ("user", "{prompt_text}")
    ])
    chain = template | evaluator_model
    response = chain.invoke({"prompt_text": promptstr})

    print("-" * 30 + " GENERATED EVALUATION " + "-" * 30)
    print(response.content.strip())
    print("-" * 80)

    # Extract the scores from the response
    scores = extract_json_from_evaluation(response.content.strip())
    return scores


def visualize_scores(evaluation_results):
    """
    Visualizes the evaluation scores for different models using a grouped bar chart.

    Args:
        evaluation_results (list): A list of dictionaries, where each dict contains
            model results, including 'Scores'.
    """
    if not evaluation_results:
        print("No results to visualize.")
        return

    # Extract the criteria (assuming all models share the same set)
    criteria = list(evaluation_results[0]['Scores'].keys())
    num_criteria = len(criteria)
    num_models = len(evaluation_results)
    bar_width = 0.2

    # Generate a color map for the different models
    colors = plt.cm.viridis(np.linspace(0, 1, num_models))

    # Set up the bar chart
    fig, ax = plt.subplots(figsize=(12, 6))

    # Generate bars for each model
    for i, result in enumerate(evaluation_results):
        # Extract scores in the order of the criteria
        model_scores = [result['Scores'][c] for c in criteria]

        # Calculate bar positions:
        # np.arange(num_criteria) gives [0, 1, 2, ...]
        # bar_width * i shifts the group of bars for the current model
        bar_positions = np.arange(num_criteria) + bar_width * i
        ax.bar(bar_positions, model_scores, bar_width,
               label=f'Model {i + 1} - {result.get("model_name", "Unknown")}',
               color=colors[i])

    # Set chart labels and title
    ax.set_xlabel('Evaluation Criteria')
    ax.set_ylabel('Scores (1 to 5)')
    ax.set_title('Evaluation Scores for Code Generation Models')

    # Center the X-axis ticks under each group of bars
    ax.set_xticks(np.arange(num_criteria) + bar_width * (num_models / 2 - 0.5))
    ax.set_xticklabels(criteria, rotation=45, ha='right')
    ax.legend()

    # Display the chart
    plt.tight_layout()
    plt.show()


# --- Main Evaluation Loop ---
print("Starting LLM Code Generation and Evaluation...")
results = []

for model_name in models:
    print("\n" + "=" * 80)
    print(f"Evaluating Model: {model_name}")
    print("=" * 80)

    # Load the text-generation pipeline for the current model
    # device=-1 runs on CPU (set to 0 or another index for a specific GPU)
    generator = pipeline("text-generation", model=model_name, device=-1)

    # Generate code by passing the problem statement directly as the prompt
    generated_code_output = generator(problem_statement, max_length=200, do_sample=False)

    # The output is a list of dicts: [{'generated_text': '...'}]; strip the echoed prompt
    generated_code = generated_code_output[0]['generated_text'].replace(problem_statement, "").strip()

    print("-" * 30 + " GENERATED CODE " + "-" * 30)
    print(f"\n{generated_code}\n")
    print("-" * 76)

    # Evaluate the generated code
    evaluation_scores = evaluate_code(problem_statement, generated_code)

    # Append the result
    if evaluation_scores:
        results.append({
            "model_name": model_name,
            "Scores": evaluation_scores
        })
    else:
        print(f"Skipping model {model_name} due to failed evaluation.")

# --- Visualization of Evaluation Results ---
print("\n" + "=" * 80)
print("Final Evaluation Results:")
print("=" * 80)
print(results)

# Visualize the scores
if results:
    visualize_scores(results)
else:
    print("No valid results to visualize.")
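
# --- Optional Sanity Check for the Evaluator (illustrative sketch, not part of the pipeline above) ---
# Before trusting the judge's scores, it can help to run evaluate_code() once on a hand-written
# reference solution that is known to be correct. If the evaluator does not score it near the top
# of the scale, the rubric or prompt likely needs adjustment. The reference_solution string below
# is a hypothetical example written for this check; it was not produced by any of the models above.
reference_solution = '''
def longest_word(sentence):
    """Return the longest word in the given sentence."""
    words = sentence.split()
    return max(words, key=len) if words else ""

print(longest_word("The quick brown fox jumped over the lazy dog"))
'''

sanity_scores = evaluate_code(problem_statement, reference_solution)
print("Sanity-check scores for the reference solution:", sanity_scores)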