Alaamat committed on
Commit 19f66bd · verified · 1 Parent(s): ef96f1d

Create app.py

Files changed (1)
  1. app.py +218 -0
app.py ADDED
@@ -0,0 +1,218 @@
+ # --- Import Libraries ---
+ import warnings
+ warnings.filterwarnings("ignore")
+ from transformers import pipeline
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import openai
+ import os
+ import json
+ import re
+
+ # Import LangChain components
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ from dotenv import load_dotenv
+
+ # --- Environment Setup ---
+ load_dotenv()
+ api_key = os.getenv("OPENAI_API_KEY")
+ if not api_key:
+     raise ValueError("OPENAI_API_KEY is not set. Add it to your environment or .env file.")
+ # Ensure the API key is set for libraries that rely on this convention
+ os.environ["OPENAI_API_KEY"] = api_key
+
+ # --- Model and Problem Definition ---
+
+ # Define the code generation models to compare
+ models = [
+     "Salesforce/codegen-350m-mono",
+     "HuggingFaceTB/SmolLM-360M",
+     "EleutherAI/gpt-neo-125M"
+ ]
+
+ # Initialize the evaluation (judge) model: an OpenAI chat model (e.g., gpt-4o)
+ model = ChatOpenAI(model_name="gpt-4o", temperature=0)
+
+ # Define a code generation problem for testing our evaluation framework
+ problem_statement = "Write a Python code that finds the longest word in a sentence."
+
+ # --- Helper Functions ---
+
+ def extract_json_from_evaluation(evaluation_text):
+     """
+     Extracts the JSON object from the given evaluation text using a regular expression.
+
+     Parameters:
+         evaluation_text (str): The text containing the evaluation, including the JSON object.
+
+     Returns:
+         dict: The extracted JSON object as a dictionary, or None on failure.
+     """
+     # Use a regular expression to find the JSON object enclosed in ```json ... ``` fences.
+     # re.DOTALL is important so that '.' also matches newlines.
+     match = re.search(r'```json\n?(.*?)\n?```', evaluation_text, re.DOTALL)
+
+     if match:
+         json_str = match.group(1).strip()
+         try:
+             # Parse the JSON string into a dictionary
+             json_data = json.loads(json_str)
+             return json_data
+         except json.JSONDecodeError:
+             print("Error: Failed to decode JSON.")
+             return None
+     else:
+         print("Error: No JSON object found in the text.")
+         return None
+
+ def evaluate_code(question, code):
+     """
+     Evaluates generated code with an OpenAI chat model acting as a code reviewer.
+     The evaluation model provides scores on several criteria in a JSON format.
+
+     Args:
+         question (str): The coding problem statement.
+         code (str): The generated Python code.
+
+     Returns:
+         dict: The extracted scores, or None if evaluation fails.
+     """
+     promptstr = f'''
+     You are a code reviewer who evaluates a given Python Code against a given Problem.
+     The coding problem is as follows: {question}
+     Evaluate the following Python Code for correctness and quality against the given problem:
+     {code}
+     Provide scores on a scale of 1 to 5 for the following criteria:
+     1. Correctness: How correct is the code in terms of logic and output against the given problem?
+     2. Efficiency: How efficient is the solution in terms of execution?
+     3. Readability: How readable and well-structured is the code?
+     4. Best Practices: How well does the code follow coding best practices?
+     5. Comments: How well are the code and logic explained with comments?
+     Return only a JSON object with the criteria and the scores, enclosed in a ```json ... ``` tag, and nothing else.
+     '''
+
+     # Wrap the prompt in a simple chat template and invoke the evaluation model through a LangChain chain
+     template = ChatPromptTemplate.from_messages([
+         ("system", "You are a helpful code reviewer that responds only with a JSON object enclosed in ```json ... ``` tags."),
+         ("user", "{prompt_text}")
+     ])
+
+     chain = template | model
+
+     response = chain.invoke({"prompt_text": promptstr})
+
+     print("-" * 30 + " GENERATED EVALUATION " + "-" * 30)
+     print(response.content.strip())
+     print("-" * 80)
+
+     # Extract the scores from the response
+     scores = extract_json_from_evaluation(response.content.strip())
+
+     return scores
+
+ def visualize_scores(evaluation_results):
+     """
+     Visualizes the evaluation scores for different models using a grouped bar chart.
+
+     Args:
+         evaluation_results (list): A list of dictionaries, where each dict contains
+                                    model results, including 'Scores'.
+     """
+     if not evaluation_results:
+         print("No results to visualize.")
+         return
+
+     # Extract the criteria (assuming all models were scored on the same set)
+     criteria = list(evaluation_results[0]['Scores'].keys())
+     num_criteria = len(criteria)
+     num_models = len(evaluation_results)
+     bar_width = 0.2
+
+     # Generate a color map for the different models
+     colors = plt.cm.viridis(np.linspace(0, 1, num_models))
+
+     # Set up the bar chart
+     fig, ax = plt.subplots(figsize=(12, 6))
+
+     # Generate bars for each model
+     for i, result in enumerate(evaluation_results):
+         # Extract scores in the order of the criteria
+         model_scores = [result['Scores'][c] for c in criteria]
+
+         # Calculate bar positions:
+         # np.arange(num_criteria) gives [0, 1, 2, ...]
+         # bar_width * i shifts the bars of the current model within each group
+         bar_positions = np.arange(num_criteria) + bar_width * i
+
+         ax.bar(bar_positions, model_scores, bar_width,
+                label=f'Model {i + 1} - {result.get("model_name", "Unknown")}', color=colors[i])
+
+     # Set chart labels and title
+     ax.set_xlabel('Evaluation Criteria')
+     ax.set_ylabel('Scores (1 to 5)')
+     ax.set_title('Evaluation Scores for Code Generation Models')
+
+     # Center the X-axis ticks under each group of bars
+     ax.set_xticks(np.arange(num_criteria) + bar_width * (num_models / 2 - 0.5))
+     ax.set_xticklabels(criteria, rotation=45, ha='right')
+
+     ax.legend()
+
+     # Display the chart
+     plt.tight_layout()
+     plt.show()
+
+ # --- Main Evaluation Loop ---
+
+ print("Starting LLM Code Generation and Evaluation...")
+ results = []
+ for model_name in models:
+     print("\n" + "=" * 80)
+     print(f"Evaluating Model: {model_name}")
+     print("=" * 80)
+
+     # Load the text-generation pipeline for the current model
+     # device=-1 runs on CPU (set it to a GPU index, e.g. 0, to use a GPU)
+     generator = pipeline("text-generation", model=model_name, device=-1)
+
+     # Generate code, passing the problem statement directly as the prompt
+     generated_code_output = generator(problem_statement, max_length=200, do_sample=False)
+
+     # The output is a list of dicts: [{'generated_text': '...'}]; strip the echoed prompt
+     generated_code = generated_code_output[0]['generated_text'].replace(problem_statement, "").strip()
+
+     print("-" * 30 + " GENERATED CODE " + "-" * 30)
+     print(f"\n{generated_code}\n")
+     print("-" * 76)
+
+     # Evaluate the generated code
+     evaluation_scores = evaluate_code(problem_statement, generated_code)
+
+     # Append the result
+     if evaluation_scores:
+         results.append({
+             "model_name": model_name,
+             "Scores": evaluation_scores
+         })
+     else:
+         print(f"Skipping model {model_name} due to failed evaluation.")
+
+ # --- Visualization of Evaluation Results ---
+
+ print("\n" + "=" * 80)
+ print("Final Evaluation Results:")
+ print("=" * 80)
+
+ print(results)
+
+ # Visualize the scores
+ if results:
+     visualize_scores(results)
+ else:
+     print("No valid results to visualize.")
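As a quick sanity check of the judge/parser contract the script sets up, here is a hypothetical sketch (not taken from app.py) of the kind of fenced reply the evaluation prompt asks gpt-4o for, and of how extract_json_from_evaluation recovers the score dictionary from it. The reply text and the exact key names are assumptions that simply mirror the five criteria listed in the prompt.

# Hypothetical example: a hand-written judge reply shaped the way the
# evaluation prompt requests, fed through extract_json_from_evaluation.
sample_reply = """Here is my evaluation of the submitted code:
```json
{
    "Correctness": 4,
    "Efficiency": 4,
    "Readability": 3,
    "Best Practices": 3,
    "Comments": 2
}
```"""

scores = extract_json_from_evaluation(sample_reply)
print(scores)                    # {'Correctness': 4, 'Efficiency': 4, ...}
print(scores["Best Practices"])  # 3

Running the script as committed assumes the packages behind the imports are installed (transformers plus a backend such as torch, langchain-core, langchain-openai, openai, python-dotenv, matplotlib, numpy) and that OPENAI_API_KEY is available in the environment or a .env file. If the judge ever replies without the ```json fence, the regex does not match, the helper returns None, and that model is simply skipped in the main loop rather than crashing the run.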