# --- Import Libraries ---
import warnings
warnings.filterwarnings("ignore")
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import os
import json
import re

# Import Langchain components
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# --- Environment Setup ---
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found; set it in your environment or .env file.")
# Ensure the API key is set for libraries that rely on this convention
os.environ["OPENAI_API_KEY"] = api_key

# --- Model and Problem Definition ---

# Define the code generation models to compare
models = [
    "Salesforce/codegen-350m-mono",
    "HuggingFaceTB/SmolLM-360M",
    "EleutherAI/gpt-neo-125M"
]

# Create the evaluation (judge) LLM using an OpenAI chat model (e.g., gpt-4o)
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Define a code generation problem for Testing our Evaluation Framework
problem_statement = "Write a Python code that finds the longest word in a sentence."

# --- Helper Functions ---

def extract_json_from_evaluation(evaluation_text):
    """
    Extracts the JSON object from the given evaluation text using a regular expression.
    
    Parameters:
    evaluation_text (str): The text containing the evaluation, including the JSON object.

    Returns:
    dict: The extracted JSON object as a dictionary, or None on failure.
    """
    
    # Use regular expression to find the JSON object within the text, enclosed in ```json ... ```
    # re.DOTALL is important to allow the '.' to match newlines
    match = re.search(r'```json\n?(.*?)\n?```', evaluation_text, re.DOTALL)
    
    if match:
        json_str = match.group(1).strip()
        try:
            # Parse the JSON string into a dictionary
            json_data = json.loads(json_str)
            return json_data
        except json.JSONDecodeError:
            print("Error: Failed to decode JSON.")
            return None
    else:
        print("Error: No JSON object found in the text.")
        return None
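
# Illustrative example of extract_json_from_evaluation (the exact text depends on
# the judge model's response; this input/output pair is assumed, not real output):
#   text = 'Here are the scores:\n```json\n{"Correctness": 4, "Readability": 5}\n```'
#   extract_json_from_evaluation(text)  # -> {"Correctness": 4, "Readability": 5}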

def evaluate_code(question, code):
    """
    Evaluates generated code by prompting an OpenAI chat model to act as a code reviewer.
    The evaluation model provides scores on several criteria in a JSON format.
    
    Args:
        question (str): The coding problem statement.
        code (str): The generated Python code.
        
    Returns:
        dict: The extracted scores, or None if evaluation fails.
    """
    promptstr = f'''
You are a code reviewer who evaluates a given Python Code against a given Problem.
The coding problem is as follows: {question}
Evaluate the following Python Code for correctness and quality against the given problem: 
{code}
Provide scores on a scale of 1 to 5 for the following criteria:
1. Correctness: How correct is the code in terms of logic and output against the given problem?
2. Efficiency: How efficient is the solution in terms of execution?
3. Readability: How readable and well-structured is the code?
4. Best Practices: How well does the code follow coding best practices?
5. Comments: How well are the code and logic explained with comments?
Return only a JSON object with the criteria and the scores, enclosed in a ```json ... ``` tag, and nothing else.
'''
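
    # For reference, the judge is expected to return JSON of roughly this shape
    # (illustrative only; the actual keys and scores come from the model's response):
    #   {"Correctness": 4, "Efficiency": 4, "Readability": 3, "Best Practices": 3, "Comments": 2}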
    
    # Use LangChain to invoke the judge model: wrap the prompt text in a
    # ChatPromptTemplate and pipe it into the ChatOpenAI instance defined above.
    
    # Define a simple template for direct text passing to the model
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful code reviewer that responds only with a JSON object enclosed in ```json ... ``` tags."),
        ("user", "{prompt_text}")
    ])
    
    chain = template | model
    
    response = chain.invoke({"prompt_text": promptstr})
    
    print("-" * 30 + " GENERATED EVALUATION " + "-" * 30)
    print(response.content.strip())
    print("-" * 80)

    # Extract the scores from the response
    scores = extract_json_from_evaluation(response.content.strip())
    
    return scores

def visualize_scores(evaluation_results):
    """
    Visualizes the evaluation scores for different models using a grouped bar chart.
    
    Args:
        evaluation_results (list): A list of dictionaries, where each dict contains 
                                   model results, including 'Scores'.
    """
    # Extract the criteria (assuming all models have the same set)
    if not evaluation_results:
        print("No results to visualize.")
        return

    criteria = list(evaluation_results[0]['Scores'].keys())
    num_criteria = len(criteria)
    num_models = len(evaluation_results)
    bar_width = 0.2
    
    # Generate a color map for different models
    colors = plt.cm.viridis(np.linspace(0, 1, num_models))

    # Set up the bar chart
    fig, ax = plt.subplots(figsize=(12, 6))

    # Generate bars for each model
    for i, result in enumerate(evaluation_results):
        # Extract scores in the order of criteria
        model_scores = [result['Scores'][c] for c in criteria]
        
        # Calculate bar positions
        # np.arange(num_criteria) gives [0, 1, 2, ...]
        # bar_width * i shifts the group of bars for the current model
        bar_positions = np.arange(num_criteria) + bar_width * i
        
        ax.bar(bar_positions, model_scores, bar_width, label=f'Model {i + 1} - {result.get("model_name", "Unknown")}', color=colors[i])

    # Set chart labels and title
    ax.set_xlabel('Evaluation Criteria')
    ax.set_ylabel('Scores (1 to 5)')
    ax.set_title('Evaluation Scores for Code Generation Models')
    
    # Set X-axis ticks to be centered under the groups of bars
    ax.set_xticks(np.arange(num_criteria) + bar_width * (num_models / 2 - 0.5))
    ax.set_xticklabels(criteria, rotation=45, ha='right')
    
    ax.legend()
    
    # Display the chart
    plt.tight_layout()
    plt.show()
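
# Illustrative standalone usage of visualize_scores (hand-crafted scores; the key
# names mirror the prompt's criteria, but real keys come from the judge's JSON):
#
# visualize_scores([
#     {"model_name": "Salesforce/codegen-350m-mono",
#      "Scores": {"Correctness": 3, "Efficiency": 3, "Readability": 2,
#                 "Best Practices": 2, "Comments": 1}},
#     {"model_name": "EleutherAI/gpt-neo-125M",
#      "Scores": {"Correctness": 2, "Efficiency": 3, "Readability": 2,
#                 "Best Practices": 2, "Comments": 1}},
# ])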

# --- Main Evaluation Loop ---

print("Starting LLM Code Generation and Evaluation...")
results = []
for model_name in models:
    print("\n" + "=" * 80)
    print(f"Evaluating Model: {model_name}")
    print("=" * 80)

    # Load the text-generation pipeline for the current model
    # device=-1 indicates using CPU (change to 0 or other for specific GPU)
    generator = pipeline("text-generation", model=model_name, device=-1)

    # Generate code
    # We pass the problem statement directly as the prompt
    generated_code_output = generator(problem_statement, max_length=200, do_sample=False)
    
    # The output is typically a list of dicts: [{'generated_text': '...'}]
    generated_code = generated_code_output[0]['generated_text'].replace(problem_statement, "").strip()

    print("-" * 30 + " GENERATED CODE " + "-" * 30)
    print(f"\n{generated_code}\n")
    print("-" * 76)

    # Evaluate code
    evaluation_scores = evaluate_code(problem_statement, generated_code)

    # Append the result
    if evaluation_scores:
        results.append({
            "model_name": model_name,
            "Scores": evaluation_scores
        })
    else:
        print(f"Skipping model {model_name} due to failed evaluation.")

# --- Visualization of Evaluation Results ---

print("\n" + "=" * 80)
print("Final Evaluation Results:")
print("=" * 80)

print(results)

# Visualize the scores
if results:
    visualize_scores(results)
else:
    print("No valid results to visualize.")