# --- Import Libraries ---
import warnings
warnings.filterwarnings("ignore")

from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import os
import json
import re

# Import LangChain components
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# --- Environment Setup ---
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Ensure the API key is visible to libraries that read it from the environment
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
else:
    raise ValueError("OPENAI_API_KEY is not set; add it to a .env file or the environment.")
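# Expected .env contents (illustrative only; the actual key value is user-specific):
#   OPENAI_API_KEY=sk-...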
# --- Model and Problem Definition ---
# Define the open code-generation models to compare
models = [
    "Salesforce/codegen-350m-mono",
    "HuggingFaceTB/SmolLM-360M",
    "EleutherAI/gpt-neo-125M"
]

# Instantiate the evaluator LLM (an OpenAI chat model, e.g. gpt-4o) that will grade the generated code
model = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Define a code-generation problem for testing our evaluation framework
problem_statement = "Write a Python code that finds the longest word in a sentence."
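# Illustrative expected behavior of a correct solution (example chosen here, not from the source):
#   longest word in "The quick brown fox jumped" -> "jumped"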
# --- Helper Functions ---
def extract_json_from_evaluation(evaluation_text):
    """
    Extracts the JSON object from the given evaluation text using a regular expression.

    Parameters:
        evaluation_text (str): The text containing the evaluation, including the JSON object.

    Returns:
        dict: The extracted JSON object as a dictionary, or None on failure.
    """
    # Find the JSON object enclosed in ```json ... ``` fences.
    # re.DOTALL lets '.' match newlines so multi-line JSON is captured.
    match = re.search(r'```json\n?(.*?)\n?```', evaluation_text, re.DOTALL)
    if match:
        json_str = match.group(1).strip()
        try:
            # Parse the JSON string into a dictionary
            return json.loads(json_str)
        except json.JSONDecodeError:
            print("Error: Failed to decode JSON.")
            return None
    else:
        print("Error: No JSON object found in the text.")
        return None
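# Illustrative sanity check for the extractor (hypothetical sample text, not a real model response):
#   extract_json_from_evaluation('```json\n{"Correctness": 5}\n```')  ->  {"Correctness": 5}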
def evaluate_code(question, code):
    """
    Evaluates generated code with an OpenAI chat model via LangChain.
    The evaluation model provides scores on several criteria in a JSON format.

    Args:
        question (str): The coding problem statement.
        code (str): The generated Python code.

    Returns:
        dict: The extracted scores, or None if evaluation fails.
    """
    promptstr = f'''
You are a code reviewer who evaluates a given Python Code against a given Problem.

The coding problem is as follows: {question}

Evaluate the following Python Code for correctness and quality against the given problem:

{code}

Provide scores on a scale of 1 to 5 for the following criteria:
1. Correctness: How correct is the code in terms of logic and output against the given problem?
2. Efficiency: How efficient is the solution in terms of execution?
3. Readability: How readable and well-structured is the code?
4. Best Practices: How well does the code follow coding best practices?
5. Comments: How well are the code and logic explained with comments?

Return only a JSON object with the criteria and the scores, enclosed in a ```json ... ``` tag, and nothing else.
'''
    # Build a simple prompt template and chain it with the evaluator model (LCEL pipe syntax)
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful code reviewer that responds only with a JSON object enclosed in ```json ... ``` tags."),
        ("user", "{prompt_text}")
    ])
    chain = template | model

    response = chain.invoke({"prompt_text": promptstr})

    print("-" * 30 + " GENERATED EVALUATION " + "-" * 30)
    print(response.content.strip())
    print("-" * 80)

    # Extract the scores from the response
    scores = extract_json_from_evaluation(response.content.strip())
    return scores
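# Note: the exact keys of the returned scores dict (e.g. "Correctness", "Efficiency",
# "Readability", "Best Practices", "Comments") come from the evaluator's response;
# visualize_scores below assumes every evaluated model returns the same key set.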
def visualize_scores(evaluation_results):
    """
    Visualizes the evaluation scores for different models using a grouped bar chart.

    Args:
        evaluation_results (list): A list of dictionaries, each containing a model's
                                   results, including its 'Scores'.
    """
    if not evaluation_results:
        print("No results to visualize.")
        return

    # Extract the criteria (assuming all models share the same set)
    criteria = list(evaluation_results[0]['Scores'].keys())
    num_criteria = len(criteria)
    num_models = len(evaluation_results)
    bar_width = 0.2

    # Generate a color map for the different models
    colors = plt.cm.viridis(np.linspace(0, 1, num_models))

    # Set up the bar chart
    fig, ax = plt.subplots(figsize=(12, 6))

    # Draw one group of bars per model
    for i, result in enumerate(evaluation_results):
        # Extract scores in the order of the criteria
        model_scores = [result['Scores'][c] for c in criteria]
        # np.arange(num_criteria) gives the base positions [0, 1, 2, ...];
        # bar_width * i shifts the current model's bars within each group
        bar_positions = np.arange(num_criteria) + bar_width * i
        ax.bar(bar_positions, model_scores, bar_width,
               label=f'Model {i + 1} - {result.get("model_name", "Unknown")}',
               color=colors[i])

    # Set chart labels and title
    ax.set_xlabel('Evaluation Criteria')
    ax.set_ylabel('Scores (1 to 5)')
    ax.set_title('Evaluation Scores for Code Generation Models')

    # Center the X-axis ticks under each group of bars
    ax.set_xticks(np.arange(num_criteria) + bar_width * (num_models / 2 - 0.5))
    ax.set_xticklabels(criteria, rotation=45, ha='right')
    ax.legend()

    # Display the chart
    plt.tight_layout()
    plt.show()
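# When running headless (e.g. on a server without a display), plt.show() may render nothing;
# fig.savefig("evaluation_scores.png") could be used inside the function instead to persist the chart.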
# --- Main Evaluation Loop ---
print("Starting LLM Code Generation and Evaluation...")

results = []
for model_name in models:
    print("\n" + "=" * 80)
    print(f"Evaluating Model: {model_name}")
    print("=" * 80)

    # Load the text-generation pipeline for the current model
    # device=-1 runs on CPU (set to 0 or another index to use a specific GPU)
    generator = pipeline("text-generation", model=model_name, device=-1)

    # Generate code, passing the problem statement directly as the prompt.
    # Note: max_length counts the prompt tokens as well as the newly generated ones.
    generated_code_output = generator(problem_statement, max_length=200, do_sample=False)

    # The output is a list of dicts: [{'generated_text': '...'}]; strip the echoed prompt
    generated_code = generated_code_output[0]['generated_text'].replace(problem_statement, "").strip()

    print("-" * 30 + " GENERATED CODE " + "-" * 30)
    print(f"\n{generated_code}\n")
    print("-" * 76)

    # Evaluate the generated code
    evaluation_scores = evaluate_code(problem_statement, generated_code)

    # Record the result
    if evaluation_scores:
        results.append({
            "model_name": model_name,
            "Scores": evaluation_scores
        })
    else:
        print(f"Skipping model {model_name} due to failed evaluation.")
# --- Visualization of Evaluation Results ---
print("\n" + "=" * 80)
print("Final Evaluation Results:")
print("=" * 80)
print(results)

# Visualize the scores
if results:
    visualize_scores(results)
else:
    print("No valid results to visualize.")