Ahmedik95316 committed
Commit f984f56 · verified · 1 Parent(s): a028318

Update initialize_system.py

Files changed (1): initialize_system.py +122 -28
initialize_system.py CHANGED
@@ -193,25 +193,51 @@ def create_minimal_dataset():
 
 def run_initial_training():
     """Run basic model training"""
-    log_step("Running basic training fallback...")
-
-    try:
-        # Import required libraries for basic training
-        import pandas as pd
-        from sklearn.model_selection import train_test_split, cross_validate
-        from sklearn.feature_extraction.text import TfidfVectorizer
-        from sklearn.linear_model import LogisticRegression
-        from sklearn.pipeline import Pipeline
-        from sklearn.metrics import accuracy_score, f1_score
-        import joblib
-        import json
-        from datetime import datetime
-
-        # Get paths
+    log_step("Starting initial model training...")
+
+    try:
+        # Get all the paths
         model_path = path_manager.get_model_file_path()
         vectorizer_path = path_manager.get_vectorizer_path()
         pipeline_path = path_manager.get_pipeline_path()
 
+        log_step(f"Model path: {model_path}")
+        log_step(f"Vectorizer path: {vectorizer_path}")
+        log_step(f"Pipeline path: {pipeline_path}")
+
+        # Check if model already exists
+        if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
+            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
+
+            # If individual components exist but pipeline doesn't, create pipeline
+            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
+                log_step("Creating pipeline from existing components...")
+                try:
+                    import joblib
+                    from sklearn.pipeline import Pipeline
+
+                    # Load existing components
+                    model = joblib.load(model_path)
+                    vectorizer = joblib.load(vectorizer_path)
+
+                    # Create pipeline
+                    pipeline = Pipeline([
+                        ('vectorizer', vectorizer),
+                        ('model', model)
+                    ])
+
+                    # Save pipeline
+                    joblib.dump(pipeline, pipeline_path)
+                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
+
+                except Exception as e:
+                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
+
+            return True
+
+        # Import required libraries
+
+
         # Load dataset
         dataset_path = path_manager.get_combined_dataset_path()
         if not dataset_path.exists():
@@ -239,7 +265,7 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
 
-        # Create basic pipeline
+        # Create pipeline with preprocessing
         pipeline = Pipeline([
             ('vectorizer', TfidfVectorizer(
                 max_features=5000,
@@ -256,9 +282,9 @@
         ])
 
         # Train model with cross-validation
-        log_step("Training basic model with cross-validation...")
+        log_step("Training model with cross-validation...")
 
-        # Perform cross-validation
+        # Perform cross-validation before final training
         cv_results = cross_validate(
             pipeline, X_train, y_train,
             cv=3,
@@ -274,11 +300,63 @@
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
-        # Save pipeline
-        log_step(f"Saving basic pipeline to: {pipeline_path}")
-        joblib.dump(pipeline, pipeline_path)
+        # Save CV results for API access
+        cv_data = {
+            "n_splits": 3,
+            "test_scores": {
+                "accuracy": {
+                    "mean": float(cv_results['test_accuracy'].mean()),
+                    "std": float(cv_results['test_accuracy'].std()),
+                    "scores": cv_results['test_accuracy'].tolist()
+                },
+                "f1": {
+                    "mean": float(cv_results['test_f1_weighted'].mean()),
+                    "std": float(cv_results['test_f1_weighted'].std()),
+                    "scores": cv_results['test_f1_weighted'].tolist()
+                }
+            },
+            "train_scores": {
+                "accuracy": {
+                    "mean": float(cv_results['train_accuracy'].mean()),
+                    "std": float(cv_results['train_accuracy'].std()),
+                    "scores": cv_results['train_accuracy'].tolist()
+                },
+                "f1": {
+                    "mean": float(cv_results['train_f1_weighted'].mean()),
+                    "std": float(cv_results['train_f1_weighted'].std()),
+                    "scores": cv_results['train_f1_weighted'].tolist()
+                }
+            }
+        }
 
-        # Save individual components for compatibility
+        # Save CV results to file
+        cv_results_path = path_manager.get_logs_path("cv_results.json")
+        with open(cv_results_path, 'w') as f:
+            json.dump(cv_data, f, indent=2)
+        log_step(f"Saved CV results to: {cv_results_path}")
+
+        # Ensure model directory exists
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save complete pipeline FIRST (this is the priority)
+        log_step(f"Saving pipeline to: {pipeline_path}")
+        joblib.dump(pipeline, pipeline_path)
+
+        # Verify pipeline was saved
+        if pipeline_path.exists():
+            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
+
+            # Test loading the pipeline
+            try:
+                test_pipeline = joblib.load(pipeline_path)
+                test_pred = test_pipeline.predict(["This is a test"])
+                log_step(f"✅ Pipeline verification successful: {test_pred}")
+            except Exception as e:
+                log_step(f"⚠️ Pipeline verification failed: {e}")
+        else:
+            log_step(f"❌ Pipeline was not saved to {pipeline_path}")
+
+        # Save individual components for backward compatibility
         try:
             joblib.dump(pipeline.named_steps['model'], model_path)
             joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
@@ -286,29 +364,45 @@
         except Exception as e:
             log_step(f"⚠️ Failed to save individual components: {e}")
 
-        # Save basic metadata
+        # Save metadata
         metadata = {
-            "model_version": "v1.0_basic_fallback",
+            "model_version": "v1.0_init",
             "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
+            "train_size": len(X_train),
+            "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
-            "training_method": "basic_fallback",
-            "environment": path_manager.environment
+            "training_method": "initialization",
+            "environment": path_manager.environment,
+            "data_path": str(dataset_path),
+            "class_distribution": class_counts.to_dict(),
+            "pipeline_created": pipeline_path.exists(),
+            "individual_components_created": model_path.exists() and vectorizer_path.exists(),
+            # Add CV results to metadata
+            "cv_f1_mean": float(cv_results['test_f1_weighted'].mean()),
+            "cv_f1_std": float(cv_results['test_f1_weighted'].std()),
+            "cv_accuracy_mean": float(cv_results['test_accuracy'].mean()),
+            "cv_accuracy_std": float(cv_results['test_accuracy'].std())
         }
 
         metadata_path = path_manager.get_metadata_path()
         with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step(f"✅ Basic training completed successfully")
+        log_step(f"✅ Training completed successfully")
         log_step(f"   Accuracy: {accuracy:.4f}")
         log_step(f"   F1 Score: {f1:.4f}")
+        log_step(f"   Pipeline saved: {pipeline_path.exists()}")
+        log_step(f"   Model saved to: {model_path}")
+        log_step(f"   Vectorizer saved to: {vectorizer_path}")
 
         return True
-
+
     except Exception as e:
-        log_step(f"❌ Basic training fallback also failed: {str(e)}")
+        log_step(f"❌ Training failed: {str(e)}")
+        import traceback
+        log_step(f"❌ Traceback: {traceback.format_exc()}")
         return False
 
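Notes on this change

The hunks above truncate the cross_validate(...) call, but the keys the new code reads from cv_results ('test_accuracy', 'test_f1_weighted', 'train_accuracy', 'train_f1_weighted') pin down how scikit-learn must have been asked to score. A sketch of the implied configuration (the scoring and return_train_score arguments are inferred from those keys, not quoted from the commit):

    from sklearn.model_selection import cross_validate

    # Inferred sketch: the diff cuts the real call off after cv=3.
    cv_results = cross_validate(
        pipeline, X_train, y_train,
        cv=3,                                 # matches "n_splits": 3 in cv_data
        scoring=['accuracy', 'f1_weighted'],  # yields test_accuracy / test_f1_weighted keys
        return_train_score=True               # yields train_accuracy / train_f1_weighted keys
    )

Each entry in the returned dict is a NumPy array with one score per fold, which is why the cv_data block can call .mean(), .std() and .tolist() on it.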
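The new cv_results.json is written "for API access", so any consumer can read the summary statistics straight out of the structure built in cv_data. A hypothetical reader, assuming the same path_manager is importable on the consuming side:

    import json

    # Hypothetical consumer of the file written by run_initial_training().
    with open(path_manager.get_logs_path("cv_results.json")) as f:
        cv = json.load(f)

    # Mean and std of the 3-fold weighted-F1 test scores.
    f1 = cv["test_scores"]["f1"]
    print(f"CV F1: {f1['mean']:.4f} ± {f1['std']:.4f}")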
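Saving the complete pipeline "FIRST" also explains the in-diff verification step: the pipeline bundles the TfidfVectorizer and the LogisticRegression, so downstream code loads one artifact and predicts on raw text with no separate vectorization step. A minimal consumer sketch (hypothetical caller, reusing the script's path_manager):

    import joblib

    # Load the single-file artifact saved by run_initial_training().
    pipeline = joblib.load(path_manager.get_pipeline_path())

    # The pipeline vectorizes raw text itself, exactly as the
    # verification step in the diff does with ["This is a test"].
    print(pipeline.predict(["This is a test"]))
    print(pipeline.predict_proba(["This is a test"]))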