Ahmedik95316 committed on
Commit
c4fbb31
·
verified ·
1 Parent(s): 44dceca

Update initialize_system.py

Browse files
Files changed (1) hide show
  1. initialize_system.py +65 -5
initialize_system.py CHANGED
@@ -1,10 +1,11 @@
1
  import os
2
  import sys
 
3
  import shutil
4
  import pandas as pd
5
- import json
6
  from pathlib import Path
7
  from datetime import datetime
 
8
 
9
  # Import the new path manager
10
  try:
@@ -274,14 +275,73 @@ def run_initial_training():
274
  ))
275
  ])
276
 
277
- # Train model
278
- log_step("Training model...")
 
 
 
 
 
 
 
 
 
 
279
  pipeline.fit(X_train, y_train)
280
-
281
- # Evaluate
282
  y_pred = pipeline.predict(X_test)
283
  accuracy = accuracy_score(y_test, y_pred)
284
  f1 = f1_score(y_test, y_pred, average='weighted')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  # Ensure model directory exists
287
  model_path.parent.mkdir(parents=True, exist_ok=True)
 
1
  import os
2
  import sys
3
+ import json
4
  import shutil
5
  import pandas as pd
 
6
  from pathlib import Path
7
  from datetime import datetime
8
+ from sklearn.model_selection import cross_validate
9
 
10
  # Import the new path manager
11
  try:
 
275
  ))
276
  ])
277
 
278
+ # Train model with cross-validation
279
+ log_step("Training model with cross-validation...")
280
+
281
+ # Perform cross-validation before final training
282
+ cv_results = cross_validate(
283
+ pipeline, X_train, y_train,
284
+ cv=3,
285
+ scoring=['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'],
286
+ return_train_score=True
287
+ )
288
+
289
+ # Train final model on all training data
290
  pipeline.fit(X_train, y_train)
291
+
292
+ # Evaluate on test set
293
  y_pred = pipeline.predict(X_test)
294
  accuracy = accuracy_score(y_test, y_pred)
295
  f1 = f1_score(y_test, y_pred, average='weighted')
296
+
297
+ # Save CV results for API access
298
+ cv_data = {
299
+ "methodology": {
300
+ "n_splits": 3,
301
+ "cv_type": "StratifiedKFold",
302
+ "random_state": 42
303
+ },
304
+ "test_scores": {
305
+ "accuracy": {
306
+ "mean": float(cv_results['test_accuracy'].mean()),
307
+ "std": float(cv_results['test_accuracy'].std()),
308
+ "scores": cv_results['test_accuracy'].tolist()
309
+ },
310
+ "f1": {
311
+ "mean": float(cv_results['test_f1_weighted'].mean()),
312
+ "std": float(cv_results['test_f1_weighted'].std()),
313
+ "scores": cv_results['test_f1_weighted'].tolist()
314
+ }
315
+ },
316
+ "train_scores": {
317
+ "accuracy": {
318
+ "mean": float(cv_results['train_accuracy'].mean()),
319
+ "std": float(cv_results['train_accuracy'].std()),
320
+ "scores": cv_results['train_accuracy'].tolist()
321
+ },
322
+ "f1": {
323
+ "mean": float(cv_results['train_f1_weighted'].mean()),
324
+ "std": float(cv_results['train_f1_weighted'].std()),
325
+ "scores": cv_results['train_f1_weighted'].tolist()
326
+ }
327
+ }
328
+ }
329
+
330
+ # Calculate quality indicators
331
+ train_acc_mean = cv_data['train_scores']['accuracy']['mean']
332
+ test_acc_mean = cv_data['test_scores']['accuracy']['mean']
333
+ test_acc_std = cv_data['test_scores']['accuracy']['std']
334
+
335
+ cv_data['performance_indicators'] = {
336
+ 'overfitting_score': float(train_acc_mean - test_acc_mean),
337
+ 'stability_score': float(1 - (test_acc_std / test_acc_mean)) if test_acc_mean > 0 else 0
338
+ }
339
+
340
+ # Save CV results to file
341
+ cv_results_path = path_manager.get_logs_path("cv_results.json")
342
+ with open(cv_results_path, 'w') as f:
343
+ json.dump(cv_data, f, indent=2)
344
+ log_step(f"Saved CV results to: {cv_results_path}")
345
 
346
  # Ensure model directory exists
347
  model_path.parent.mkdir(parents=True, exist_ok=True)