Ahmedik95316 committed
Commit 4798f78 · 1 Parent(s): 0908ace

Update model/retrain.py


Adding LightGBM for Ensemble Model
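The gist of the change: retraining now trains LogisticRegression, RandomForest, and LightGBM through the same preprocessing pipeline, then wraps them in a soft-voting ensemble that is only promoted when cross-validated comparisons favor it. A minimal, self-contained sketch of that pattern (standalone names and toy data are illustrative, not this module's actual API):

```python
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, random_state=42)

# Individual models, mirroring the trio configured in setup_models()
models = {
    'logistic_regression': LogisticRegression(max_iter=500, class_weight='balanced'),
    'random_forest': RandomForestClassifier(n_estimators=50, class_weight='balanced'),
    'lightgbm': lgb.LGBMClassifier(n_estimators=100, num_leaves=31, verbose=-1),
}

# Soft voting averages predict_proba across members, so every member
# must expose predict_proba (all three above do)
ensemble = VotingClassifier(estimators=list(models.items()), voting='soft', n_jobs=1)

for name, model in {**models, 'ensemble': ensemble}.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted')
    print(f"{name}: {scores.mean():.4f} (±{scores.std():.4f})")
```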

Files changed (1)
  1. model/retrain.py +587 -623
model/retrain.py CHANGED
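The ensemble is only recommended when it beats an individual model by a statistically significant margin on per-fold F1 scores (see `statistical_ensemble_comparison` in the diff below). The comparison itself lives in `CVModelComparator._compare_metric_scores`, which is not part of this diff; a sketch of the paired test it presumably applies to matched fold scores:

```python
import numpy as np
from scipy import stats

def compare_fold_scores(baseline_scores, candidate_scores, alpha=0.05):
    """Paired t-test on matched CV fold scores (illustrative helper, not the module's API)."""
    baseline = np.asarray(baseline_scores)
    candidate = np.asarray(candidate_scores)
    t_stat, p_value = stats.ttest_rel(candidate, baseline)
    return {
        'improvement': float(candidate.mean() - baseline.mean()),
        'tests': {'paired_ttest': {'t_stat': float(t_stat),
                                   'p_value': float(p_value),
                                   'significant': bool(p_value < alpha)}},
    }

# Example: F1 per fold for one model vs. the ensemble on the same 5 folds
print(compare_fold_scores([0.81, 0.79, 0.83, 0.80, 0.82],
                          [0.84, 0.82, 0.85, 0.83, 0.86]))
```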
@@ -1,10 +1,8 @@
- # Enhanced version with comprehensive cross-validation and advanced feature engineering
-

  import json
  import shutil
  import joblib
- import asyncio
  import logging
  import hashlib
  import schedule
@@ -27,15 +25,18 @@ from sklearn.metrics import (
      roc_auc_score, confusion_matrix, classification_report
  )
  from sklearn.model_selection import (
-     cross_val_score, StratifiedKFold, cross_validate, train_test_split
  )
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.linear_model import LogisticRegression
- from sklearn.ensemble import RandomForestClassifier
  from sklearn.pipeline import Pipeline
  from sklearn.preprocessing import FunctionTransformer
  from sklearn.feature_selection import SelectKBest, chi2

  # Import enhanced feature engineering components
  try:
      from features.feature_engineer import AdvancedFeatureEngineer, create_enhanced_pipeline, analyze_feature_importance
@@ -433,74 +434,141 @@ class CVModelComparator:
          except Exception as e:
              logger.error(f"Metric comparison failed for {metric}: {e}")
              return {'metric': metric, 'error': str(e)}

-     def _calculate_decision_confidence(self, comparison_results: Dict) -> float:
-         """Calculate confidence in the promotion decision with enhanced features consideration"""
-         try:
-             confidence_factors = []
-
-             # Check F1 improvement significance
-             f1_comp = comparison_results['metric_comparisons'].get('f1', {})
-             if f1_comp.get('significant_improvement', False):
-                 confidence_factors.append(0.4)
-             elif f1_comp.get('improvement', 0) > 0.01:
-                 confidence_factors.append(0.2)
-
-             # Feature engineering upgrade bonus
-             feature_upgrade = comparison_results.get('feature_engineering_comparison', {}).get('feature_upgrade', {})
-             if feature_upgrade.get('is_upgrade', False):
-                 confidence_factors.append(0.3)  # Significant bonus for feature upgrades
-
-             # Check consistency across metrics
-             improved_metrics = 0
-             total_metrics = 0
-             for metric_comp in comparison_results['metric_comparisons'].values():
-                 if isinstance(metric_comp, dict) and 'improvement' in metric_comp:
-                     total_metrics += 1
-                     if metric_comp['improvement'] > 0:
-                         improved_metrics += 1
-
-             if total_metrics > 0:
-                 consistency_score = improved_metrics / total_metrics
-                 confidence_factors.append(consistency_score * 0.3)
-
-             # Check effect sizes
-             effect_sizes = []
-             for metric_comp in comparison_results['metric_comparisons'].values():
-                 if isinstance(metric_comp, dict) and 'effect_size' in metric_comp:
-                     effect_sizes.append(abs(metric_comp['effect_size']))
-
-             if effect_sizes:
-                 avg_effect_size = np.mean(effect_sizes)
-                 if avg_effect_size > 0.5:  # Large effect
-                     confidence_factors.append(0.2)
-                 elif avg_effect_size > 0.2:  # Medium effect
-                     confidence_factors.append(0.1)
-
-             # Calculate final confidence
-             total_confidence = sum(confidence_factors)
-             return min(1.0, max(0.0, total_confidence))
-
-         except Exception as e:
-             logger.warning(f"Confidence calculation failed: {e}")
-             return 0.5


  class EnhancedModelRetrainer:
-     """Production-ready model retraining with enhanced feature engineering and comprehensive CV"""

      def __init__(self):
          self.setup_paths()
          self.setup_retraining_config()
          self.setup_statistical_tests()
          self.cv_comparator = CVModelComparator()

          # Enhanced feature engineering settings
          self.enhanced_features_available = ENHANCED_FEATURES_AVAILABLE
          self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE  # Default to enhanced if available

-         logger.info(f"Enhanced retraining initialized with features: {'enhanced' if self.use_enhanced_features else 'standard'}")

      def setup_paths(self):
          """Setup all necessary paths"""
@@ -549,18 +617,19 @@ class EnhancedModelRetrainer:
          self.max_retries = 3
          self.backup_retention_days = 30

-         # Enhanced feature configuration
-         self.enhanced_feature_config = {
-             'enable_sentiment': True,
-             'enable_readability': True,
-             'enable_entities': True,
-             'enable_linguistic': True,
-             'feature_selection_k': 3000,
-             'tfidf_max_features': 7500,
-             'ngram_range': (1, 3),
-             'min_df': 2,
-             'max_df': 0.95
-         }

      def setup_statistical_tests(self):
          """Setup statistical test configurations"""
@@ -570,6 +639,54 @@ class EnhancedModelRetrainer:
          'mcnemar': {'alpha': 0.05, 'name': "McNemar's Test"}
      }

      def detect_production_feature_type(self) -> str:
          """Detect what type of features the production model uses"""
          try:
@@ -717,36 +834,32 @@ class EnhancedModelRetrainer:
          logger.info(f"Data cleaning: {initial_count} -> {len(df)} samples")
          return df

-     def create_enhanced_pipeline(self, use_enhanced_features: bool = None) -> Pipeline:
-         """Create enhanced ML pipeline with feature engineering"""
-
-         if use_enhanced_features is None:
-             use_enhanced_features = self.use_enhanced_features
-
-         if use_enhanced_features and ENHANCED_FEATURES_AVAILABLE:
              logger.info("Creating enhanced feature engineering pipeline for retraining...")

-             # Create enhanced feature engineer with configuration
              feature_engineer = AdvancedFeatureEngineer(
-                 enable_sentiment=self.enhanced_feature_config['enable_sentiment'],
-                 enable_readability=self.enhanced_feature_config['enable_readability'],
-                 enable_entities=self.enhanced_feature_config['enable_entities'],
-                 enable_linguistic=self.enhanced_feature_config['enable_linguistic'],
-                 feature_selection_k=self.enhanced_feature_config['feature_selection_k'],
-                 tfidf_max_features=self.enhanced_feature_config['tfidf_max_features'],
-                 ngram_range=self.enhanced_feature_config['ngram_range'],
-                 min_df=self.enhanced_feature_config['min_df'],
-                 max_df=self.enhanced_feature_config['max_df']
              )

              # Create pipeline with enhanced features
              pipeline = Pipeline([
                  ('enhanced_features', feature_engineer),
-                 ('model', LogisticRegression(
-                     max_iter=1000,
-                     class_weight='balanced',
-                     random_state=self.random_state
-                 ))
              ])

              return pipeline
@@ -754,34 +867,279 @@ class EnhancedModelRetrainer:
          else:
              logger.info("Creating standard TF-IDF pipeline for retraining...")

-             def preprocess_text(texts):
-                 return preprocess_text_function(texts)
-
              # Create standard pipeline
              pipeline = Pipeline([
-                 ('preprocess', FunctionTransformer(preprocess_text, validate=False)),
-                 ('vectorize', TfidfVectorizer(
-                     max_features=10000,
-                     min_df=2,
-                     max_df=0.95,
-                     ngram_range=(1, 3),
-                     stop_words='english',
-                     sublinear_tf=True
-                 )),
-                 ('feature_select', SelectKBest(chi2, k=5000)),
-                 ('model', LogisticRegression(
-                     max_iter=1000,
-                     class_weight='balanced',
-                     random_state=self.random_state
-                 ))
              ])

-             return pipeline

      def train_candidate_model(self, df: pd.DataFrame) -> Tuple[bool, Optional[Any], Dict]:
          """Train candidate model with enhanced features and comprehensive CV evaluation"""
          try:
-             logger.info("Training candidate model with enhanced feature engineering...")

              # Prepare data
              X = df['text'].values
@@ -793,40 +1151,50 @@ class EnhancedModelRetrainer:

              logger.info(f"Training candidate with {candidate_feature_type} features (production uses {prod_feature_type})")

-             # Create and train pipeline
-             pipeline = self.create_enhanced_pipeline(self.use_enhanced_features)
-
-             # Perform cross-validation before final training
-             logger.info("Performing cross-validation on candidate model...")
-             cv_results = self.cv_comparator.perform_model_cv_evaluation(pipeline, X, y)
-
-             # Train on full dataset for final model
-             pipeline.fit(X, y)
-
              # Additional holdout evaluation
              X_train, X_test, y_train, y_test = train_test_split(
                  X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
              )

-             pipeline_holdout = self.create_enhanced_pipeline(self.use_enhanced_features)
-             pipeline_holdout.fit(X_train, y_train)
-
-             # Evaluate on holdout
-             y_pred = pipeline_holdout.predict(X_test)
-             y_pred_proba = pipeline_holdout.predict_proba(X_test)[:, 1]
-
-             holdout_metrics = {
-                 'accuracy': float(accuracy_score(y_test, y_pred)),
-                 'precision': float(precision_score(y_test, y_pred, average='weighted')),
-                 'recall': float(recall_score(y_test, y_pred, average='weighted')),
-                 'f1': float(f1_score(y_test, y_pred, average='weighted')),
-                 'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
-             }

              # Extract feature information if using enhanced features
              feature_analysis = {}
-             if self.use_enhanced_features and hasattr(pipeline, 'named_steps'):
-                 feature_engineer = pipeline.named_steps.get('enhanced_features')
              if feature_engineer and hasattr(feature_engineer, 'get_feature_metadata'):
                  try:
                      feature_analysis = {
@@ -834,29 +1202,36 @@ class EnhancedModelRetrainer:
                          'feature_importance': feature_engineer.get_feature_importance(top_k=20) if hasattr(feature_engineer, 'get_feature_importance') else {},
                          'total_features': len(feature_engineer.get_feature_names()) if hasattr(feature_engineer, 'get_feature_names') else 0
                      }
-                     logger.info(f"Enhanced features extracted: {feature_analysis['total_features']} total features")
                  except Exception as e:
                      logger.warning(f"Could not extract feature analysis: {e}")

-             # Combine CV and holdout results
              evaluation_results = {
                  'cross_validation': cv_results,
-                 'holdout_evaluation': holdout_metrics,
                  'feature_analysis': feature_analysis,
                  'feature_type': candidate_feature_type,
                  'training_samples': len(X),
-                 'test_samples': len(X_test)
              }

              # Save candidate model
-             joblib.dump(pipeline, self.candidate_pipeline_path)
-             if hasattr(pipeline, 'named_steps'):
-                 if 'model' in pipeline.named_steps:
-                     joblib.dump(pipeline.named_steps['model'], self.candidate_model_path)

              # Save enhanced features or vectorizer
-             if 'enhanced_features' in pipeline.named_steps:
-                 feature_engineer = pipeline.named_steps['enhanced_features']
              if hasattr(feature_engineer, 'save_pipeline'):
                  feature_engineer.save_pipeline(self.candidate_feature_engineer_path)
@@ -868,19 +1243,24 @@ class EnhancedModelRetrainer:
                  }
                  joblib.dump(enhanced_ref, self.candidate_vectorizer_path)

-             elif 'vectorize' in pipeline.named_steps:
-                 joblib.dump(pipeline.named_steps['vectorize'], self.candidate_vectorizer_path)

              # Log results
              if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
                  cv_f1_mean = cv_results['test_scores']['f1']['mean']
                  cv_f1_std = cv_results['test_scores']['f1']['std']
-                 logger.info(f"Candidate model CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")

-             logger.info(f"Candidate model holdout F1: {holdout_metrics['f1']:.4f}")
              logger.info(f"Candidate model training completed with {candidate_feature_type} features")

-             return True, pipeline, evaluation_results

          except Exception as e:
              error_msg = f"Candidate model training failed: {str(e)}"
@@ -1013,18 +1393,19 @@ class EnhancedModelRetrainer:

              # Extract metrics from candidate evaluation
              cv_results = candidate_metrics.get('cross_validation', {})
-             holdout_results = candidate_metrics.get('holdout_evaluation', {})
              feature_analysis = candidate_metrics.get('feature_analysis', {})

              # Update metadata with comprehensive information
              metadata.update({
                  'model_version': new_version,
-                 'model_type': 'enhanced_retrained_pipeline_cv',
                  'previous_version': old_version,
                  'promotion_timestamp': datetime.now().isoformat(),
-                 'retrain_trigger': 'enhanced_cv_validated_retrain',
                  'training_samples': candidate_metrics.get('training_samples', 'Unknown'),
-                 'test_samples': candidate_metrics.get('test_samples', 'Unknown')
              })

              # Enhanced feature engineering metadata
@@ -1068,16 +1449,6 @@ class EnhancedModelRetrainer:
          except Exception as e:
              logger.warning(f"Could not save feature analysis: {e}")

-         # Add holdout evaluation results
-         if holdout_results:
-             metadata.update({
-                 'test_accuracy': holdout_results.get('accuracy', 'Unknown'),
-                 'test_f1': holdout_results.get('f1', 'Unknown'),
-                 'test_precision': holdout_results.get('precision', 'Unknown'),
-                 'test_recall': holdout_results.get('recall', 'Unknown'),
-                 'test_roc_auc': holdout_results.get('roc_auc', 'Unknown')
-             })
-
          # Add comprehensive CV results
          if cv_results and 'test_scores' in cv_results:
              metadata['cross_validation'] = {
@@ -1096,7 +1467,9 @@ class EnhancedModelRetrainer:
                  'cv_f1_mean': cv_results['test_scores']['f1']['mean'],
                  'cv_f1_std': cv_results['test_scores']['f1']['std'],
                  'cv_f1_min': cv_results['test_scores']['f1']['min'],
-                 'cv_f1_max': cv_results['test_scores']['f1']['max']
              })

          # Add enhanced model comparison results
@@ -1104,7 +1477,7 @@ class EnhancedModelRetrainer:
              metadata['promotion_validation'] = {
                  'decision_confidence': promotion_decision.get('confidence', 'Unknown'),
                  'promotion_reason': promotion_decision.get('reason', 'Unknown'),
-                 'comparison_method': 'enhanced_cv_statistical_tests',
                  'feature_engineering_factor': promotion_decision.get('feature_engineering_factor', False),
                  'feature_upgrade_details': promotion_decision.get('feature_upgrade_details', {})
              }
@@ -1122,6 +1495,11 @@ class EnhancedModelRetrainer:
                  'tests': comparison.get('tests', {})
              }

          # Save updated metadata
          with open(self.metadata_path, 'w') as f:
              json.dump(metadata, f, indent=2)
@@ -1132,7 +1510,8 @@ class EnhancedModelRetrainer:
              total_features = feature_analysis.get('total_features', 0)
              feature_info = f" with {total_features} enhanced features"

-         logger.info(f"Model promoted successfully to {new_version}{feature_info}")
          logger.info(f"Promotion reason: {promotion_decision.get('reason', 'Enhanced CV validation passed')}")

          return True
@@ -1148,9 +1527,10 @@ class EnhancedModelRetrainer:
              'timestamp': datetime.now().isoformat(),
              'results': results,
              'session_id': hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8],
-             'retraining_type': 'enhanced_cv_features',
              'enhanced_features_used': self.use_enhanced_features,
-             'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE
          }

          # Load existing logs
@@ -1190,7 +1570,8 @@ class EnhancedModelRetrainer:
              'enhanced_features_info': {
                  'used': self.use_enhanced_features,
                  'available': ENHANCED_FEATURES_AVAILABLE,
-                 'feature_comparison': results['comparison_results'].get('feature_engineering_comparison', {})
              }
          }

@@ -1205,9 +1586,9 @@ class EnhancedModelRetrainer:
              logger.error(f"Failed to log enhanced retraining session: {str(e)}")

      def retrain_model(self) -> Tuple[bool, str]:
-         """Main retraining function with enhanced feature engineering and comprehensive CV validation"""
          try:
-             logger.info("Starting enhanced model retraining with advanced feature engineering...")

              # Load existing metadata
              existing_metadata = self.load_existing_metadata()
@@ -1218,7 +1599,7 @@ class EnhancedModelRetrainer:
                  logger.warning(f"No production model found: {prod_msg}")
                  # Fall back to initial training
                  try:
-                     from model.train import main as train_main
                      train_main()
                      return True, "Initial enhanced training completed"
                  except ImportError:
@@ -1238,8 +1619,10 @@ class EnhancedModelRetrainer:
              candidate_feature_type = 'enhanced' if self.use_enhanced_features else 'standard'

              logger.info(f"Retraining strategy: {prod_feature_type} -> {candidate_feature_type}")

-             # Train candidate model with enhanced features
              candidate_success, candidate_model, candidate_metrics = self.train_candidate_model(df)
              if not candidate_success:
                  return False, f"Enhanced candidate training failed: {candidate_metrics.get('error', 'Unknown error')}"
@@ -1259,12 +1642,15 @@ class EnhancedModelRetrainer:
                  'comparison_results': comparison_results,
                  'data_size': len(df),
                  'cv_folds': self.cv_folds,
-                 'retraining_method': 'enhanced_cv_features',
                  'feature_engineering': {
                      'production_type': prod_feature_type,
                      'candidate_type': candidate_feature_type,
                      'feature_upgrade': comparison_results.get('feature_engineering_comparison', {})
-                 }
              }

              self.log_retraining_session(session_results)
@@ -1285,6 +1671,7 @@ class EnhancedModelRetrainer:
                  improvement = f1_comp.get('improvement', 0)
                  confidence = promotion_decision.get('confidence', 0)
                  feature_upgrade = promotion_decision.get('feature_engineering_factor', False)

                  feature_info = ""
                  if feature_upgrade:
@@ -1292,8 +1679,12 @@ class EnhancedModelRetrainer:
                  elif candidate_feature_type == 'enhanced':
                      feature_info = " using enhanced features"

                  success_msg = (
-                     f"Enhanced model promoted successfully{feature_info}! "
                      f"F1 improvement: {improvement:.4f}, "
                      f"Confidence: {confidence:.2f}, "
                      f"Reason: {promotion_decision.get('reason', 'Enhanced CV validation passed')}"
@@ -1306,9 +1697,11 @@ class EnhancedModelRetrainer:
                  # Keep current model
                  reason = promotion_decision.get('reason', 'No significant improvement detected')
                  confidence = promotion_decision.get('confidence', 0)

                  keep_msg = (
                      f"Keeping current model based on enhanced CV analysis. "
                      f"Reason: {reason}, "
                      f"Confidence: {confidence:.2f}"
                  )
@@ -1340,487 +1733,57 @@ class EnhancedModelRetrainer:
              return False, f"Automated enhanced retraining failed: {str(e)}"


  class AutomatedRetrainingManager:
      """Manages automated retraining triggers and scheduling with enhanced features"""

      def __init__(self, base_dir: Path = None):
          self.base_dir = base_dir or Path("/tmp")
          self.setup_automation_paths()
-         self.setup_automation_config()
          self.drift_monitor = AdvancedDriftMonitor()
          self.retraining_active = False
-         self.automation_thread = None
-         self.last_check_time = None
-
-         # Enhanced feature settings
          self.enhanced_features_available = ENHANCED_FEATURES_AVAILABLE

-         self.automation_status = {
-             'enabled': True,
-             'last_automated_training': None,
-             'total_automated_trainings': 0,
-             'failed_attempts': 0,
-             'enhanced_features_used': self.enhanced_features_available
-         }
-
          logger.info(f"Automated retraining manager initialized with enhanced features: {self.enhanced_features_available}")

      def setup_automation_paths(self):
          """Setup automation-specific paths"""
          self.automation_dir = self.base_dir / "automation"
          self.automation_dir.mkdir(parents=True, exist_ok=True)
-
          self.automation_log_path = self.automation_dir / "automation_log.json"
-         self.retraining_queue_path = self.automation_dir / "retraining_queue.json"
-         self.automation_config_path = self.automation_dir / "automation_config.json"
-
-     def setup_automation_config(self):
-         """Setup automation configuration with enhanced feature considerations"""
-         self.automation_config = {
-             'monitoring_schedule': {
-                 'check_interval_minutes': 360,  # 6 hours
-                 'force_check_interval_hours': 24,
-                 'max_daily_retrainings': 3,
-                 'cooldown_hours_after_training': 6
-             },
-             'retraining_conditions': {
-                 'require_data_quality_check': True,
-                 'min_time_between_trainings': timedelta(hours=6),
-                 'max_consecutive_failures': 3,
-                 'emergency_override': True,
-                 'prefer_enhanced_features': True  # New setting
-             },
-             'notification_settings': {
-                 'notify_on_trigger': True,
-                 'notify_on_completion': True,
-                 'notify_on_failure': True,
-                 'notify_on_feature_upgrade': True  # New setting
-             }
-         }
-
-         self.load_automation_config()
-
-     def load_automation_config(self):
-         """Load automation configuration from file"""
-         try:
-             if self.automation_config_path.exists():
-                 with open(self.automation_config_path, 'r') as f:
-                     saved_config = json.load(f)
-
-                 # Update with saved settings
-                 self.automation_config.update(saved_config)
-                 logger.info("Loaded enhanced automation configuration")
-         except Exception as e:
-             logger.warning(f"Failed to load automation config: {e}")
-
-     def save_automation_config(self):
-         """Save automation configuration to file"""
-         try:
-             with open(self.automation_config_path, 'w') as f:
-                 # Convert timedelta objects to strings for JSON serialization
-                 config_to_save = json.loads(json.dumps(self.automation_config, default=str))
-                 json.dump(config_to_save, f, indent=2)
-         except Exception as e:
-             logger.error(f"Failed to save automation config: {e}")
-
-     def start_automated_monitoring(self):
-         """Start the automated monitoring and retraining system with enhanced features"""
-         if self.retraining_active:
-             logger.warning("Automated monitoring already active")
-             return
-
-         self.retraining_active = True
-
-         # Schedule periodic checks
-         check_interval = self.automation_config['monitoring_schedule']['check_interval_minutes']
-         schedule.every(check_interval).minutes.do(self.perform_scheduled_check)
-
-         # Schedule daily forced check
-         schedule.every().day.at("02:00").do(self.perform_forced_check)
-
-         # Start background thread
-         self.automation_thread = threading.Thread(target=self.automation_loop, daemon=True)
-         self.automation_thread.start()
-
-         logger.info("Enhanced automated retraining monitoring started")
-         self.log_automation_event("monitoring_started", "Enhanced automated monitoring system started")
-
-     def stop_automated_monitoring(self):
-         """Stop the automated monitoring system"""
-         self.retraining_active = False
-         schedule.clear()
-
-         if self.automation_thread:
-             self.automation_thread.join(timeout=10)
-
-         logger.info("Enhanced automated retraining monitoring stopped")
-         self.log_automation_event("monitoring_stopped", "Enhanced automated monitoring system stopped")
-
-     def automation_loop(self):
-         """Main automation loop"""
-         while self.retraining_active:
-             try:
-                 schedule.run_pending()
-                 time_module.sleep(60)  # Check every minute
-             except Exception as e:
-                 logger.error(f"Enhanced automation loop error: {e}")
-                 time_module.sleep(300)  # Wait 5 minutes on error
-
-     def perform_scheduled_check(self):
-         """Perform scheduled retraining trigger check"""
-         try:
-             logger.info("Performing scheduled enhanced retraining trigger check")
-
-             # Check if we're in cooldown period
-             if self.is_in_cooldown_period():
-                 logger.info("Skipping check - in cooldown period after recent training")
-                 return
-
-             # Check daily limit
-             if self.exceeded_daily_retraining_limit():
-                 logger.info("Skipping check - daily retraining limit exceeded")
-                 return
-
-             # Perform trigger evaluation
-             trigger_results = self.drift_monitor.check_retraining_triggers()
-
-             self.last_check_time = datetime.now()
-
-             # Log the check
-             self.log_automation_event("scheduled_check", "Performed scheduled enhanced trigger check", {
-                 'trigger_results': trigger_results,
-                 'should_retrain': trigger_results.get('should_retrain', False),
-                 'enhanced_features_available': self.enhanced_features_available
-             })
-
-             # Trigger retraining if needed
-             if trigger_results.get('should_retrain', False):
-                 self.queue_retraining(trigger_results)
-
-         except Exception as e:
-             logger.error(f"Scheduled enhanced check failed: {e}")
-             self.log_automation_event("check_failed", f"Scheduled enhanced check failed: {str(e)}")
-
-     def perform_forced_check(self):
-         """Perform forced daily check regardless of other conditions"""
-         try:
-             logger.info("Performing forced daily enhanced retraining check")
-
-             # Always perform drift monitoring
-             trigger_results = self.drift_monitor.check_retraining_triggers()
-
-             self.log_automation_event("forced_check", "Performed forced daily enhanced check", {
-                 'trigger_results': trigger_results,
-                 'enhanced_features_available': self.enhanced_features_available
-             })
-
-             # Only trigger retraining for urgent cases during forced check
-             if trigger_results.get('urgency') in ['critical', 'high']:
-                 self.queue_retraining(trigger_results, forced=True)
-
-         except Exception as e:
-             logger.error(f"Forced enhanced check failed: {e}")
-             self.log_automation_event("forced_check_failed", f"Forced enhanced check failed: {str(e)}")
-
-     def queue_retraining(self, trigger_results: Dict, forced: bool = False):
-         """Queue a retraining job with enhanced feature considerations"""
-         try:
-             retraining_job = {
-                 'queued_at': datetime.now().isoformat(),
-                 'trigger_results': trigger_results,
-                 'urgency': trigger_results.get('urgency', 'medium'),
-                 'forced': forced,
-                 'status': 'queued',
-                 'attempts': 0,
-                 'enhanced_features_available': self.enhanced_features_available,
-                 'prefer_enhanced_features': self.automation_config['retraining_conditions'].get('prefer_enhanced_features', True)
-             }
-
-             # Load existing queue
-             queue = self.load_retraining_queue()
-             queue.append(retraining_job)
-
-             # Save queue
-             self.save_retraining_queue(queue)
-
-             logger.info(f"Enhanced retraining queued with urgency: {trigger_results.get('urgency', 'medium')}")
-             self.log_automation_event("retraining_queued", "Enhanced retraining job queued", retraining_job)
-
-             # Execute immediately for critical cases
-             if trigger_results.get('urgency') == 'critical' or forced:
-                 self.execute_queued_retraining()
-
-         except Exception as e:
-             logger.error(f"Failed to queue enhanced retraining: {e}")
-             self.log_automation_event("queue_failed", f"Failed to queue enhanced retraining: {str(e)}")
-
-     def execute_queued_retraining(self):
-         """Execute queued retraining jobs with enhanced features"""
-         try:
-             queue = self.load_retraining_queue()
-
-             # Sort by urgency (critical > high > medium > low)
-             urgency_order = {'critical': 0, 'high': 1, 'medium': 2, 'low': 3}
-             queue.sort(key=lambda x: urgency_order.get(x.get('urgency', 'medium'), 2))
-
-             executed_jobs = []
-
-             for job in queue:
-                 if job['status'] == 'queued':
-                     success = self.execute_single_retraining(job)
-
-                     if success:
-                         job['status'] = 'completed'
-                         job['completed_at'] = datetime.now().isoformat()
-                         self.automation_status['last_automated_training'] = datetime.now().isoformat()
-                         self.automation_status['total_automated_trainings'] += 1
-                         executed_jobs.append(job)
-                         break  # Only execute one job at a time
-                     else:
-                         job['attempts'] += 1
-                         if job['attempts'] >= 3:
-                             job['status'] = 'failed'
-                             job['failed_at'] = datetime.now().isoformat()
-                             self.automation_status['failed_attempts'] += 1
-
-             # Update queue
-             remaining_queue = [job for job in queue if job['status'] == 'queued']
-             self.save_retraining_queue(remaining_queue)
-
-             return len(executed_jobs) > 0
-
-         except Exception as e:
-             logger.error(f"Failed to execute queued enhanced retraining: {e}")
-             return False
-
-     def execute_single_retraining(self, job: Dict) -> bool:
-         """Execute a single retraining job with enhanced features"""
-         try:
-             logger.info(f"Starting automated enhanced retraining with urgency: {job.get('urgency', 'medium')}")
-
-             job['started_at'] = datetime.now().isoformat()
-             job['status'] = 'running'
-
-             # Create enhanced retraining manager
-             retrainer = EnhancedModelRetrainer()
-
-             # Configure enhanced features based on job preferences
-             prefer_enhanced = job.get('prefer_enhanced_features', True)
-             retrainer.use_enhanced_features = prefer_enhanced and ENHANCED_FEATURES_AVAILABLE
-
-             # Perform enhanced retraining with validation
-             success, result = retrainer.automated_retrain_with_validation()
-
-             if success:
-                 logger.info("Automated enhanced retraining completed successfully")
-                 self.log_automation_event("retraining_success", "Automated enhanced retraining completed", {
-                     'job': job,
-                     'result': result,
-                     'enhanced_features_used': retrainer.use_enhanced_features
-                 })
-                 return True
-             else:
-                 logger.error(f"Automated enhanced retraining failed: {result}")
-                 self.log_automation_event("retraining_failed", f"Automated enhanced retraining failed: {result}", {
-                     'job': job,
-                     'enhanced_features_used': retrainer.use_enhanced_features
-                 })
-                 return False
-
-         except Exception as e:
-             logger.error(f"Enhanced retraining execution failed: {e}")
-             self.log_automation_event("retraining_error", f"Enhanced retraining execution error: {str(e)}", {'job': job})
-             return False
-
-     def is_in_cooldown_period(self) -> bool:
-         """Check if we're in cooldown period after recent training"""
-         try:
-             if not self.automation_status['last_automated_training']:
-                 return False
-
-             last_training = datetime.fromisoformat(self.automation_status['last_automated_training'])
-             cooldown_hours = self.automation_config['retraining_conditions']['cooldown_hours_after_training']
-             cooldown_period = timedelta(hours=cooldown_hours)
-
-             return datetime.now() - last_training < cooldown_period
-
-         except Exception as e:
-             logger.warning(f"Failed to check cooldown period: {e}")
-             return False
-
-     def exceeded_daily_retraining_limit(self) -> bool:
-         """Check if daily retraining limit has been exceeded"""
-         try:
-             if not self.automation_status['last_automated_training']:
-                 return False
-
-             last_training = datetime.fromisoformat(self.automation_status['last_automated_training'])
-
-             # Count trainings in last 24 hours
-             training_logs = self.get_recent_automation_logs(hours=24)
-             training_count = len([log for log in training_logs if log.get('event') == 'retraining_success'])
-
-             max_daily = self.automation_config['monitoring_schedule']['max_daily_retrainings']
-
-             return training_count >= max_daily
-
-         except Exception as e:
-             logger.warning(f"Failed to check daily limit: {e}")
-             return False
-
-     def load_retraining_queue(self) -> List[Dict]:
-         """Load retraining queue from file"""
-         try:
-             if self.retraining_queue_path.exists():
-                 with open(self.retraining_queue_path, 'r') as f:
-                     return json.load(f)
-             return []
-         except Exception as e:
-             logger.error(f"Failed to load retraining queue: {e}")
-             return []
-
-     def save_retraining_queue(self, queue: List[Dict]):
-         """Save retraining queue to file"""
-         try:
-             with open(self.retraining_queue_path, 'w') as f:
-                 json.dump(queue, f, indent=2)
-         except Exception as e:
-             logger.error(f"Failed to save retraining queue: {e}")
-
-     def log_automation_event(self, event: str, message: str, details: Dict = None):
-         """Log automation events with enhanced feature information"""
-         try:
-             log_entry = {
-                 'timestamp': datetime.now().isoformat(),
-                 'event': event,
-                 'message': message,
-                 'details': details or {},
-                 'enhanced_features_available': self.enhanced_features_available
-             }
-
-             # Load existing logs
-             logs = []
-             if self.automation_log_path.exists():
-                 try:
-                     with open(self.automation_log_path, 'r') as f:
-                         logs = json.load(f)
-                 except:
-                     logs = []
-
-             logs.append(log_entry)
-
-             # Keep only last 1000 entries
-             if len(logs) > 1000:
-                 logs = logs[-1000:]
-
-             # Save logs
-             with open(self.automation_log_path, 'w') as f:
-                 json.dump(logs, f, indent=2)
-
-         except Exception as e:
-             logger.error(f"Failed to log automation event: {e}")
-
-     def get_recent_automation_logs(self, hours: int = 24) -> List[Dict]:
-         """Get recent automation logs"""
-         try:
-             if not self.automation_log_path.exists():
-                 return []
-
-             with open(self.automation_log_path, 'r') as f:
-                 logs = json.load(f)
-
-             cutoff_time = datetime.now() - timedelta(hours=hours)
-             recent_logs = [
-                 log for log in logs
-                 if datetime.fromisoformat(log['timestamp']) > cutoff_time
-             ]
-
-             return recent_logs
-
-         except Exception as e:
-             logger.error(f"Failed to get recent logs: {e}")
-             return []
-
-     def get_automation_status(self) -> Dict:
-         """Get current automation status with enhanced feature information"""
-         try:
-             status = {
-                 **self.automation_status,
-                 'monitoring_active': self.retraining_active,
-                 'last_check_time': self.last_check_time.isoformat() if self.last_check_time else None,
-                 'in_cooldown': self.is_in_cooldown_period(),
-                 'daily_limit_exceeded': self.exceeded_daily_retraining_limit(),
-                 'queued_jobs': len(self.load_retraining_queue()),
-                 'recent_logs': self.get_recent_automation_logs(hours=6),
-                 'enhanced_features_status': {
-                     'available': self.enhanced_features_available,
-                     'preference': self.automation_config['retraining_conditions'].get('prefer_enhanced_features', True)
-                 }
-             }
-
-             return status
-
-         except Exception as e:
-             logger.error(f"Failed to get automation status: {e}")
-             return {'error': str(e)}

      def trigger_manual_retraining(self, reason: str = "manual_trigger", use_enhanced: bool = None) -> Dict:
          """Manually trigger retraining with enhanced feature options"""
          try:
-             # Use enhanced features by default if available
              if use_enhanced is None:
                  use_enhanced = self.enhanced_features_available

-             # Create manual trigger results
-             trigger_results = {
-                 'should_retrain': True,
-                 'urgency': 'high',
-                 'trigger_reason': reason,
-                 'triggers_detected': [{
-                     'type': 'manual_trigger',
-                     'severity': 'high',
-                     'message': f'Manual enhanced retraining triggered: {reason}'
-                 }],
-                 'recommendations': ['Manual enhanced retraining requested'],
-                 'enhanced_features_requested': use_enhanced
-             }
-
-             # Queue the retraining
-             self.queue_retraining(trigger_results, forced=True)

              feature_info = " with enhanced features" if use_enhanced else " with standard features"
-             return {
-                 'success': True,
-                 'message': f'Manual enhanced retraining queued{feature_info}',
-                 'enhanced_features': use_enhanced
-             }

          except Exception as e:
              logger.error(f"Manual enhanced retraining trigger failed: {e}")
              return {'success': False, 'error': str(e)}


- def start_automation_system():
-     """Start the enhanced automated retraining system"""
-     try:
-         automation_manager = AutomatedRetrainingManager()
-         automation_manager.start_automated_monitoring()
-         return automation_manager
-     except Exception as e:
-         logger.error(f"Failed to start enhanced automation system: {e}")
-         return None
-
- def get_automation_manager() -> Optional[AutomatedRetrainingManager]:
-     """Get or create enhanced automation manager instance"""
-     global _automation_manager
-
-     if '_automation_manager' not in globals():
-         _automation_manager = AutomatedRetrainingManager()
-
-     return _automation_manager
-
  def main():
-     """Main execution function with enhanced CV and feature engineering"""
      retrainer = EnhancedModelRetrainer()
      success, message = retrainer.retrain_model()
@@ -1830,5 +1793,6 @@ def main():
          print(f"❌ {message}")
          exit(1)

  if __name__ == "__main__":
      main()

+ # Enhanced version with LightGBM, ensemble voting, and comprehensive cross-validation

  import json
  import shutil
  import joblib
  import logging
  import hashlib
  import schedule

      roc_auc_score, confusion_matrix, classification_report
  )
  from sklearn.model_selection import (
+     cross_val_score, StratifiedKFold, cross_validate, train_test_split, GridSearchCV
  )
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.linear_model import LogisticRegression
+ from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  from sklearn.pipeline import Pipeline
  from sklearn.preprocessing import FunctionTransformer
  from sklearn.feature_selection import SelectKBest, chi2

+ # Import LightGBM
+ import lightgbm as lgb
+
  # Import enhanced feature engineering components
  try:
      from features.feature_engineer import AdvancedFeatureEngineer, create_enhanced_pipeline, analyze_feature_importance

          except Exception as e:
              logger.error(f"Metric comparison failed for {metric}: {e}")
              return {'metric': metric, 'error': str(e)}
+
+
+ class EnsembleManager:
+     """Manage ensemble model creation and validation for retraining (matching train.py)"""
+
+     def __init__(self, random_state: int = 42):
+         self.random_state = random_state
+
+     def create_ensemble(self, individual_models: Dict[str, Any],
+                         voting: str = 'soft') -> VotingClassifier:
+         """Create ensemble from individual models"""
+
+         estimators = [(name, model) for name, model in individual_models.items()]
+
+         ensemble = VotingClassifier(
+             estimators=estimators,
+             voting=voting,
+             n_jobs=1  # CPU optimization for HFS
+         )
+
+         logger.info(f"Created {voting} voting ensemble with {len(estimators)} models for retraining")
+         return ensemble
+
+     def evaluate_ensemble_vs_individuals(self, ensemble, individual_models: Dict,
+                                          X_test, y_test) -> Dict:
+         """Compare ensemble performance against individual models"""
+
+         results = {}
+
+         # Evaluate individual models
+         for name, model in individual_models.items():
+             y_pred = model.predict(X_test)
+             y_pred_proba = model.predict_proba(X_test)[:, 1]
+
+             results[name] = {
+                 'accuracy': float(accuracy_score(y_test, y_pred)),
+                 'precision': float(precision_score(y_test, y_pred, average='weighted')),
+                 'recall': float(recall_score(y_test, y_pred, average='weighted')),
+                 'f1': float(f1_score(y_test, y_pred, average='weighted')),
+                 'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
+             }
+
+         # Evaluate ensemble
+         y_pred_ensemble = ensemble.predict(X_test)
+         y_pred_proba_ensemble = ensemble.predict_proba(X_test)[:, 1]
+
+         results['ensemble'] = {
+             'accuracy': float(accuracy_score(y_test, y_pred_ensemble)),
+             'precision': float(precision_score(y_test, y_pred_ensemble, average='weighted')),
+             'recall': float(recall_score(y_test, y_pred_ensemble, average='weighted')),
+             'f1': float(f1_score(y_test, y_pred_ensemble, average='weighted')),
+             'roc_auc': float(roc_auc_score(y_test, y_pred_proba_ensemble))
+         }
+
+         # Calculate improvement over best individual model
+         best_individual_f1 = max(results[name]['f1'] for name in individual_models.keys())
+         ensemble_f1 = results['ensemble']['f1']
+         improvement = ensemble_f1 - best_individual_f1
+
+         results['ensemble_analysis'] = {
+             'best_individual_f1': best_individual_f1,
+             'ensemble_f1': ensemble_f1,
+             'improvement': improvement,
+             'improvement_percentage': (improvement / best_individual_f1) * 100 if best_individual_f1 > 0 else 0,
+             'is_better': improvement > 0
+         }
+
+         return results
+
+     def statistical_ensemble_comparison(self, ensemble, individual_models: Dict,
+                                         X, y, cv_manager) -> Dict:
+         """Perform statistical comparison between ensemble and individual models"""
+
+         cv_strategy = cv_manager.create_cv_strategy(X, y)
+
+         results = {}
+
+         # Get CV results for ensemble
+         ensemble_cv = cv_manager.perform_model_cv_evaluation(ensemble, X, y, cv_strategy)
+         results['ensemble'] = ensemble_cv
+
+         # Get CV results for individual models
+         individual_cv_results = {}
+         for name, model in individual_models.items():
+             model_cv = cv_manager.perform_model_cv_evaluation(model, X, y, cv_strategy)
+             individual_cv_results[name] = model_cv
+             results[name] = model_cv
+
+         # Compare ensemble with each individual model
+         comparisons = {}
+         for name, model_cv in individual_cv_results.items():
+             comparison = cv_manager._compare_metric_scores(
+                 model_cv['test_scores']['f1']['scores'] if 'test_scores' in model_cv and 'f1' in model_cv['test_scores'] else [],
+                 ensemble_cv['test_scores']['f1']['scores'] if 'test_scores' in ensemble_cv and 'f1' in ensemble_cv['test_scores'] else [],
+                 'f1', name, 'ensemble'
+             )
+             comparisons[f'ensemble_vs_{name}'] = comparison
+
+         results['statistical_comparisons'] = comparisons
+
+         # Determine if ensemble should be used
+         ensemble_f1_scores = ensemble_cv.get('test_scores', {}).get('f1', {}).get('scores', [])
+
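+         # Count how many individual models the ensemble beats with a significant paired t-test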
+         significantly_better_count = 0
+         for comparison in comparisons.values():
+             if comparison.get('tests', {}).get('paired_ttest', {}).get('significant', False) and comparison.get('improvement', 0) > 0:
+                 significantly_better_count += 1
+
+         results['ensemble_recommendation'] = {
+             'use_ensemble': significantly_better_count > 0,
+             'significantly_better_than': significantly_better_count,
+             'total_comparisons': len(comparisons),
+             'confidence': significantly_better_count / len(comparisons) if comparisons else 0
+         }
+
+         return results

  class EnhancedModelRetrainer:
+     """Production-ready model retraining with LightGBM, enhanced features, and ensemble voting"""

      def __init__(self):
          self.setup_paths()
          self.setup_retraining_config()
          self.setup_statistical_tests()
+         self.setup_models()  # Add LightGBM and ensemble management
          self.cv_comparator = CVModelComparator()
+         self.ensemble_manager = EnsembleManager()

          # Enhanced feature engineering settings
          self.enhanced_features_available = ENHANCED_FEATURES_AVAILABLE
          self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE  # Default to enhanced if available
+         self.enable_ensemble = True  # Enable ensemble by default

+         logger.info(f"Enhanced retraining initialized with features: {'enhanced' if self.use_enhanced_features else 'standard'}, ensemble: {self.enable_ensemble}")

      def setup_paths(self):
          """Setup all necessary paths"""

          self.max_retries = 3
          self.backup_retention_days = 30

+         # Enhanced feature configuration matching train.py; use_enhanced_features is only
+         # assigned later in __init__, so fall back to the module-level flag here
+         if getattr(self, 'use_enhanced_features', ENHANCED_FEATURES_AVAILABLE):
+             self.max_features = 7500
+             self.feature_selection_k = 3000
+         else:
+             self.max_features = 5000
+             self.feature_selection_k = 2000
+
+         self.min_df = 1
+         self.max_df = 0.95
+         self.ngram_range = (1, 2)
+         self.max_iter = 500
+         self.class_weight = 'balanced'

      def setup_statistical_tests(self):
          """Setup statistical test configurations"""

          'mcnemar': {'alpha': 0.05, 'name': "McNemar's Test"}
      }

+     def setup_models(self):
+         """Setup model configurations including LightGBM (matching train.py)"""
+         self.models = {
+             'logistic_regression': {
+                 'model': LogisticRegression(
+                     max_iter=self.max_iter,
+                     class_weight=self.class_weight,
+                     random_state=self.random_state,
+                     n_jobs=1  # CPU optimization
+                 ),
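+                 # Grid keys use the Pipeline step name: 'model__C' tunes C on the 'model' step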
+                 'param_grid': {
+                     'model__C': [0.1, 1, 10],
+                     'model__penalty': ['l2']
+                 }
+             },
+             'random_forest': {
+                 'model': RandomForestClassifier(
+                     n_estimators=50,  # Reduced for CPU efficiency
+                     class_weight=self.class_weight,
+                     random_state=self.random_state,
+                     n_jobs=1  # CPU optimization
+                 ),
+                 'param_grid': {
+                     'model__n_estimators': [50, 100],
+                     'model__max_depth': [10, None]
+                 }
+             },
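+             # LightGBM grows trees leaf-wise; keeping num_leaves at or below 2**max_depth
+             # bounds tree size for CPU-only training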
+             'lightgbm': {
+                 'model': lgb.LGBMClassifier(
+                     objective='binary',
+                     boosting_type='gbdt',
+                     num_leaves=31,
+                     max_depth=10,
+                     learning_rate=0.1,
+                     n_estimators=100,
+                     class_weight=self.class_weight,
+                     random_state=self.random_state,
+                     n_jobs=1,  # CPU optimization
+                     verbose=-1  # Suppress LightGBM output
+                 ),
+                 'param_grid': {
+                     'model__n_estimators': [50, 100],
+                     'model__learning_rate': [0.05, 0.1],
+                     'model__num_leaves': [15, 31]
+                 }
+             }
+         }
+
      def detect_production_feature_type(self) -> str:
          """Detect what type of features the production model uses"""
          try:

          logger.info(f"Data cleaning: {initial_count} -> {len(df)} samples")
          return df

+     def create_preprocessing_pipeline(self, use_enhanced: bool = None) -> Pipeline:
+         """Create preprocessing pipeline with optional enhanced features (matching train.py)"""
+
+         if use_enhanced is None:
+             use_enhanced = self.use_enhanced_features
+
+         if use_enhanced and ENHANCED_FEATURES_AVAILABLE:
              logger.info("Creating enhanced feature engineering pipeline for retraining...")

+             # Create enhanced feature engineer
              feature_engineer = AdvancedFeatureEngineer(
+                 enable_sentiment=True,
+                 enable_readability=True,
+                 enable_entities=True,
+                 enable_linguistic=True,
+                 feature_selection_k=self.feature_selection_k,
+                 tfidf_max_features=self.max_features,
+                 ngram_range=self.ngram_range,
+                 min_df=self.min_df,
+                 max_df=self.max_df
              )

              # Create pipeline with enhanced features
              pipeline = Pipeline([
                  ('enhanced_features', feature_engineer),
+                 ('model', None)  # Will be set during training
              ])

              return pipeline

          else:
              logger.info("Creating standard TF-IDF pipeline for retraining...")

+             # Use the standalone function instead of lambda
+             text_preprocessor = FunctionTransformer(
+                 func=preprocess_text_function,
+                 validate=False
+             )
+
+             # TF-IDF vectorization with optimized parameters
+             vectorizer = TfidfVectorizer(
+                 max_features=self.max_features,
+                 min_df=self.min_df,
+                 max_df=self.max_df,
+                 ngram_range=self.ngram_range,
+                 stop_words='english',
+                 sublinear_tf=True,
+                 norm='l2'
+             )
+
+             # Feature selection
+             feature_selector = SelectKBest(
+                 score_func=chi2,
+                 k=min(self.feature_selection_k, self.max_features)
+             )
+
              # Create standard pipeline
              pipeline = Pipeline([
+                 ('preprocess', text_preprocessor),
+                 ('vectorize', vectorizer),
+                 ('feature_select', feature_selector),
+                 ('model', None)  # Will be set during training
              ])
+
+             return pipeline
+
+     def hyperparameter_tuning_with_cv(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
+         """Perform hyperparameter tuning with nested cross-validation (matching train.py)"""
+
+         logger.info(f"Tuning {model_name} for retraining with {'enhanced' if self.use_enhanced_features else 'standard'} features")
+
+         try:
+             # Set the model in the pipeline
+             pipeline.set_params(model=self.models[model_name]['model'])
+
+             # Skip hyperparameter tuning for very small datasets
+             if len(X_train) < 20:
+                 logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
+                 pipeline.fit(X_train, y_train)
+
+                 # Still perform CV evaluation
+                 cv_results = self.cv_comparator.perform_model_cv_evaluation(pipeline, X_train, y_train)
+
+                 return pipeline, {
+                     'best_params': 'default_parameters',
+                     'best_score': cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'not_calculated'),
+                     'best_estimator': pipeline,
+                     'cross_validation': cv_results,
+                     'note': 'Hyperparameter tuning skipped for small dataset'
+                 }
+
+             # Get parameter grid
+             param_grid = self.models[model_name]['param_grid']
+
+             # Create CV strategy
+             cv_strategy = self.cv_comparator.create_cv_strategy(X_train, y_train)
+
+             # Create GridSearchCV with nested cross-validation
+             grid_search = GridSearchCV(
+                 pipeline,
+                 param_grid,
+                 cv=cv_strategy,
+                 scoring='f1_weighted',
+                 n_jobs=1,  # Single job for CPU optimization
+                 verbose=0,  # Reduce verbosity for speed
+                 return_train_score=True  # For overfitting analysis
+             )
+
+             # Fit grid search
+             logger.info(f"Starting hyperparameter tuning for {model_name}...")
+             grid_search.fit(X_train, y_train)
+
+             # Perform additional CV on best model
+             logger.info(f"Performing final CV evaluation for {model_name}...")
+             best_cv_results = self.cv_comparator.perform_model_cv_evaluation(
+                 grid_search.best_estimator_, X_train, y_train, cv_strategy
+             )
+
+             # Extract results
+             tuning_results = {
+                 'best_params': grid_search.best_params_,
+                 'best_score': float(grid_search.best_score_),
+                 'best_estimator': grid_search.best_estimator_,
+                 'cv_folds_used': cv_strategy.n_splits,
+                 'cross_validation': best_cv_results,
+                 'grid_search_results': {
+                     'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
+                     'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
+                     'mean_train_scores': grid_search.cv_results_['mean_train_score'].tolist() if 'mean_train_score' in grid_search.cv_results_ else [],
+                     'params': grid_search.cv_results_['params']
+                 }
+             }
+
+             logger.info(f"Hyperparameter tuning completed for {model_name}")
+             logger.info(f"Best CV score: {grid_search.best_score_:.4f}")
+             logger.info(f"Best params: {grid_search.best_params_}")
+
+             if 'test_scores' in best_cv_results and 'f1' in best_cv_results['test_scores']:
+                 final_f1 = best_cv_results['test_scores']['f1']['mean']
+                 final_f1_std = best_cv_results['test_scores']['f1']['std']
+                 logger.info(f"Final CV F1: {final_f1:.4f} (±{final_f1_std:.4f})")
+
+             return grid_search.best_estimator_, tuning_results
+
+         except Exception as e:
+             logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}")
+             # Return basic model if tuning fails
+             try:
+                 pipeline.set_params(model=self.models[model_name]['model'])
+                 pipeline.fit(X_train, y_train)
+
+                 # Perform basic CV
+                 cv_results = self.cv_comparator.perform_model_cv_evaluation(pipeline, X_train, y_train)
+
+                 return pipeline, {
+                     'error': str(e),
+                     'fallback': 'simple_training',
+                     'cross_validation': cv_results
+                 }
+             except Exception as e2:
+                 logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
+                 raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
+
+     def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
+         """Train and evaluate multiple models including LightGBM with enhanced features and ensemble (matching train.py)"""
+
+         results = {}
+         individual_models = {}
+
+         for model_name in self.models.keys():
+             logger.info(f"Training {model_name} for retraining with {'enhanced' if self.use_enhanced_features else 'standard'} features...")
+
+             try:
+                 # Create pipeline (enhanced or standard)
+                 pipeline = self.create_preprocessing_pipeline()
+
+                 # Hyperparameter tuning with CV
+                 best_model, tuning_results = self.hyperparameter_tuning_with_cv(
+                     pipeline, X_train, y_train, model_name
+                 )
+
+                 # Store results
+                 results[model_name] = {
+                     'model': best_model,
+                     'tuning_results': tuning_results,
+                     'training_time': datetime.now().isoformat(),
+                     'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
+                 }
+
+                 # Store for ensemble creation
+                 individual_models[model_name] = best_model
+
+                 # Log results (format scores first; a conditional inside an
+                 # f-string format spec is not a valid format spec)
+                 cv_results = tuning_results.get('cross_validation', {})
+                 cv_f1_mean = cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'N/A')
+                 cv_f1_std = cv_results.get('test_scores', {}).get('f1', {}).get('std', 'N/A')
+                 cv_f1_mean_str = f"{cv_f1_mean:.4f}" if cv_f1_mean != 'N/A' else cv_f1_mean
+                 cv_f1_std_str = f"{cv_f1_std:.4f}" if cv_f1_std != 'N/A' else cv_f1_std
+
+                 logger.info(f"Model {model_name} - CV F1: {cv_f1_mean_str} (±{cv_f1_std_str})")
+
+             except Exception as e:
+                 logger.error(f"Training failed for {model_name}: {str(e)}")
+                 results[model_name] = {'error': str(e)}
+
+         # Create and evaluate ensemble if enabled and we have multiple successful models
+         if self.enable_ensemble and len(individual_models) >= 2:
+             logger.info("Creating ensemble model for retraining...")
+
+             try:
+                 # Create ensemble
+                 ensemble = self.ensemble_manager.create_ensemble(individual_models, voting='soft')
+
+                 # Recombine the split only for the CV-based statistical comparison below
+                 X_full_train = np.concatenate([X_train, X_test])
+                 y_full_train = np.concatenate([y_train, y_test])
+
+                 # Fit ensemble on the training split
+                 ensemble.fit(X_train, y_train)
+
+                 # Compare ensemble with individual models using statistical tests
+                 statistical_comparison = self.ensemble_manager.statistical_ensemble_comparison(
+                     ensemble, individual_models, X_full_train, y_full_train, self.cv_comparator
+                 )
+
+                 # Store ensemble results
+                 results['ensemble'] = {
+                     'model': ensemble,
+                     'statistical_comparison': statistical_comparison,
+                     'training_time': datetime.now().isoformat(),
+                     'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
+                 }
+
+                 # Add ensemble to individual models for selection
+                 individual_models['ensemble'] = ensemble
+
+                 # Log ensemble results
+                 recommendation = statistical_comparison.get('ensemble_recommendation', {})
+                 if recommendation.get('use_ensemble', False):
+                     logger.info(f"✅ Ensemble recommended for retraining (confidence: {recommendation.get('confidence', 0):.2f})")
+                 else:
+                     logger.info("❌ Ensemble not recommended for retraining")
+
+             except Exception as e:
+                 logger.error(f"Ensemble creation failed for retraining: {str(e)}")
+                 results['ensemble'] = {'error': str(e)}
+
+         return results

1084
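`create_ensemble` is implemented elsewhere in this file; as a hedged sketch, the soft-voting mechanism it presumably wraps can be expressed directly with scikit-learn and LightGBM. The estimator settings below are illustrative, not the tuned pipelines above:

# Illustrative sketch of soft voting -- not code from this commit.
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

def build_soft_voting_ensemble():
    estimators = [
        ('logistic_regression', LogisticRegression(max_iter=1000)),
        ('random_forest', RandomForestClassifier(n_estimators=200)),
        ('lightgbm', LGBMClassifier(n_estimators=200)),
    ]
    # voting='soft' averages predict_proba across members, so every
    # estimator must implement predict_proba (all three above do)
    return VotingClassifier(estimators=estimators, voting='soft')
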
+    def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
+        """Select the best performing model based on CV results with ensemble consideration (matching train.py)"""
+
+        best_model_name = None
+        best_model = None
+        best_score = -1
+        best_metrics = None
+
+        # Consider the ensemble first if it exists and is recommended
+        if 'ensemble' in results and 'error' not in results['ensemble']:
+            ensemble_result = results['ensemble']
+            statistical_comparison = ensemble_result.get('statistical_comparison', {})
+            recommendation = statistical_comparison.get('ensemble_recommendation', {})
+
+            if recommendation.get('use_ensemble', False):
+                ensemble_cv = statistical_comparison.get('ensemble', {})
+
+                if 'test_scores' in ensemble_cv and 'f1' in ensemble_cv['test_scores']:
+                    f1_score = ensemble_cv['test_scores']['f1']['mean']
+                    if f1_score > best_score:
+                        best_score = f1_score
+                        best_model_name = 'ensemble'
+                        best_model = ensemble_result['model']
+                        best_metrics = {'cross_validation': ensemble_cv}
+                        logger.info("✅ Ensemble selected as best model for retraining")
+
+        # If the ensemble was not selected, choose the best individual model
+        if best_model_name is None:
+            for model_name, result in results.items():
+                if 'error' in result or model_name == 'ensemble':
+                    continue
+
+                # Prefer the CV F1 score when available
+                tuning_results = result.get('tuning_results', {})
+                cv_results = tuning_results.get('cross_validation', {})
+                if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
+                    f1_score = cv_results['test_scores']['f1']['mean']
+                else:
+                    f1_score = tuning_results.get('best_score', 0)
+
+                if f1_score > best_score:
+                    best_score = f1_score
+                    best_model_name = model_name
+                    best_model = result['model']
+                    best_metrics = {'cross_validation': cv_results} if cv_results else tuning_results
+
+        if best_model_name is None:
+            raise ValueError("No models trained successfully for retraining")
+
+        score_type = "CV F1" if 'cross_validation' in best_metrics else "Grid Search F1"
+        logger.info(f"Best model for retraining: {best_model_name} with {score_type} score: {best_score:.4f}")
+        return best_model_name, best_model, best_metrics
+
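Taken together, a hedged usage sketch of the two methods above; the toy texts and labels are invented stand-ins for the real dataset:

# Usage sketch with toy data -- not code from this commit.
import numpy as np
from sklearn.model_selection import train_test_split

texts = np.array([f"fake claim {i}" for i in range(50)] +
                 [f"real report {i}" for i in range(50)])
labels = np.array([1] * 50 + [0] * 50)
X_tr, X_te, y_tr, y_te = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42)

retrainer = EnhancedModelRetrainer()
results = retrainer.train_and_evaluate_models(X_tr, X_te, y_tr, y_te)
best_name, best_model, best_metrics = retrainer.select_best_model(results)
print(best_name, best_metrics.get('cross_validation', {}))
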
     def train_candidate_model(self, df: pd.DataFrame) -> Tuple[bool, Optional[Any], Dict]:
         """Train candidate model with enhanced features and comprehensive CV evaluation"""
         try:
+            logger.info("Training candidate model with enhanced feature engineering and LightGBM...")
 
             # Prepare data
             X = df['text'].values
 
             logger.info(f"Training candidate with {candidate_feature_type} features (production uses {prod_feature_type})")
 
             # Additional holdout evaluation
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
             )
 
+            # Train and evaluate models including LightGBM and ensemble
+            results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
+
+            # Select best model (could be the ensemble)
+            best_model_name, best_model, best_metrics = self.select_best_model(results)
+
+            # Train final model on the full dataset
+            final_pipeline = self.create_preprocessing_pipeline(self.use_enhanced_features)
+
+            # Replace the model component with the selected best model
+            if hasattr(best_model, 'named_steps') and 'model' in best_model.named_steps:
+                final_pipeline.set_params(model=best_model.named_steps['model'])
+                final_pipeline.fit(X, y)  # refit the selected model on the full dataset
+                best_model = final_pipeline
+            elif best_model_name == 'ensemble':
+                # For the ensemble, recreate it with individual models refitted on the full data
+                individual_models = {}
+                for name, result in results.items():
+                    if name != 'ensemble' and 'error' not in result:
+                        # Retrain individual model on full data
+                        individual_pipeline = self.create_preprocessing_pipeline(self.use_enhanced_features)
+                        individual_pipeline.set_params(model=result['model'].named_steps['model'])
+                        individual_pipeline.fit(X, y)
+                        individual_models[name] = individual_pipeline
+
+                if len(individual_models) >= 2:
+                    final_ensemble = self.ensemble_manager.create_ensemble(individual_models, voting='soft')
+                    final_ensemble.fit(X, y)
+                    best_model = final_ensemble
+                else:
+                    # Fallback to the best individual model
+                    final_pipeline.fit(X, y)
+                    best_model = final_pipeline
+            else:
+                final_pipeline.fit(X, y)
+                best_model = final_pipeline
 
  # Extract feature information if using enhanced features
1195
  feature_analysis = {}
1196
+ if self.use_enhanced_features and hasattr(best_model, 'named_steps'):
1197
+ feature_engineer = best_model.named_steps.get('enhanced_features')
1198
  if feature_engineer and hasattr(feature_engineer, 'get_feature_metadata'):
1199
  try:
1200
  feature_analysis = {
 
1202
  'feature_importance': feature_engineer.get_feature_importance(top_k=20) if hasattr(feature_engineer, 'get_feature_importance') else {},
1203
  'total_features': len(feature_engineer.get_feature_names()) if hasattr(feature_engineer, 'get_feature_names') else 0
1204
  }
1205
+ logger.info(f"Enhanced features extracted: {feature_analysis.get('total_features', 0)} total features")
1206
  except Exception as e:
1207
  logger.warning(f"Could not extract feature analysis: {e}")
1208
 
1209
+ # Perform final CV evaluation on the selected model
1210
+ cv_results = self.cv_comparator.perform_model_cv_evaluation(best_model, X, y)
1211
+
1212
+ # Combine results
1213
             evaluation_results = {
                 'cross_validation': cv_results,
                 'feature_analysis': feature_analysis,
                 'feature_type': candidate_feature_type,
                 'training_samples': len(X),
+                'test_samples': len(X_test),
+                'model_selection': {
+                    'selected_model': best_model_name,
+                    'selection_reason': f"Best {best_model_name} based on CV F1 score",
+                    'all_results': {k: v for k, v in results.items() if 'error' not in v}
+                }
             }
 
             # Save candidate model
+            joblib.dump(best_model, self.candidate_pipeline_path)
+            if hasattr(best_model, 'named_steps'):
+                if 'model' in best_model.named_steps:
+                    joblib.dump(best_model.named_steps['model'], self.candidate_model_path)
 
                 # Save enhanced features or vectorizer
+                if 'enhanced_features' in best_model.named_steps:
+                    feature_engineer = best_model.named_steps['enhanced_features']
                     if hasattr(feature_engineer, 'save_pipeline'):
                         feature_engineer.save_pipeline(self.candidate_feature_engineer_path)
 
                     }
                     joblib.dump(enhanced_ref, self.candidate_vectorizer_path)
 
+                elif 'vectorize' in best_model.named_steps:
+                    joblib.dump(best_model.named_steps['vectorize'], self.candidate_vectorizer_path)
+            elif best_model_name == 'ensemble':
+                # Save the ensemble directly
+                joblib.dump(best_model, self.candidate_model_path)
+                # Create a dummy vectorizer reference for the ensemble
+                ensemble_ref = {'type': 'ensemble', 'model_type': best_model_name}
+                joblib.dump(ensemble_ref, self.candidate_vectorizer_path)
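Once the artifacts are written, a quick sanity check could reload the candidate pipeline and score a sample. The path below is a placeholder; the real `candidate_pipeline_path` is configured elsewhere in this file:

# Sanity-check sketch with a placeholder path -- not code from this commit.
import joblib

candidate = joblib.load("/tmp/model/candidate_pipeline.joblib")  # placeholder path
sample = ["Breaking: scientists confirm shocking claim"]
print(candidate.predict(sample), candidate.predict_proba(sample))
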
 
             # Log results
             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
                 cv_f1_mean = cv_results['test_scores']['f1']['mean']
                 cv_f1_std = cv_results['test_scores']['f1']['std']
+                logger.info(f"Candidate model ({best_model_name}) CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
 
             logger.info(f"Candidate model training completed with {candidate_feature_type} features")
 
+            return True, best_model, evaluation_results
 
         except Exception as e:
             error_msg = f"Candidate model training failed: {str(e)}"
 
             # Extract metrics from candidate evaluation
             cv_results = candidate_metrics.get('cross_validation', {})
             feature_analysis = candidate_metrics.get('feature_analysis', {})
+            model_selection = candidate_metrics.get('model_selection', {})
 
             # Update metadata with comprehensive information
             metadata.update({
                 'model_version': new_version,
+                'model_type': 'enhanced_retrained_pipeline_cv_ensemble',
                 'previous_version': old_version,
                 'promotion_timestamp': datetime.now().isoformat(),
+                'retrain_trigger': 'enhanced_cv_validated_retrain_with_lightgbm_ensemble',
                 'training_samples': candidate_metrics.get('training_samples', 'Unknown'),
+                'test_samples': candidate_metrics.get('test_samples', 'Unknown'),
+                'selected_model': model_selection.get('selected_model', 'unknown')
             })
 
             # Enhanced feature engineering metadata
 
             except Exception as e:
                 logger.warning(f"Could not save feature analysis: {e}")
 
             # Add comprehensive CV results
             if cv_results and 'test_scores' in cv_results:
                 metadata['cross_validation'] = {
 
                     'cv_f1_mean': cv_results['test_scores']['f1']['mean'],
                     'cv_f1_std': cv_results['test_scores']['f1']['std'],
                     'cv_f1_min': cv_results['test_scores']['f1']['min'],
+                    'cv_f1_max': cv_results['test_scores']['f1']['max'],
+                    'test_f1': cv_results['test_scores']['f1']['mean'],  # For compatibility
+                    'test_accuracy': cv_results['test_scores'].get('accuracy', {}).get('mean', 'Unknown')
                 })
 
             # Add enhanced model comparison results
 
             metadata['promotion_validation'] = {
                 'decision_confidence': promotion_decision.get('confidence', 'Unknown'),
                 'promotion_reason': promotion_decision.get('reason', 'Unknown'),
+                'comparison_method': 'enhanced_cv_statistical_tests_with_lightgbm_ensemble',
                 'feature_engineering_factor': promotion_decision.get('feature_engineering_factor', False),
                 'feature_upgrade_details': promotion_decision.get('feature_upgrade_details', {})
             }
 
                 'tests': comparison.get('tests', {})
             }
 
+            # Add model selection information
+            metadata['model_selection_details'] = model_selection
+            metadata['ensemble_enabled'] = self.enable_ensemble
+            metadata['models_trained'] = list(self.models.keys())
+
             # Save updated metadata
             with open(self.metadata_path, 'w') as f:
                 json.dump(metadata, f, indent=2)
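For reference, a hypothetical snapshot of the metadata written here; all values are invented, and the keys mirror the updates above:

# Illustrative shape only -- not real output from this pipeline.
example_metadata = {
    'model_version': 'v14',  # invented
    'model_type': 'enhanced_retrained_pipeline_cv_ensemble',
    'retrain_trigger': 'enhanced_cv_validated_retrain_with_lightgbm_ensemble',
    'selected_model': 'ensemble',
    'ensemble_enabled': True,
    'models_trained': ['logistic_regression', 'random_forest', 'lightgbm'],  # assumed model keys
    'cross_validation': {'cv_f1_mean': 0.91, 'cv_f1_std': 0.02},  # invented scores
}
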
 
             total_features = feature_analysis.get('total_features', 0)
             feature_info = f" with {total_features} enhanced features"
 
+            selected_model = model_selection.get('selected_model', 'unknown')
+            logger.info(f"Model promoted successfully to {new_version} (selected: {selected_model}){feature_info}")
             logger.info(f"Promotion reason: {promotion_decision.get('reason', 'Enhanced CV validation passed')}")
 
             return True
 
             'timestamp': datetime.now().isoformat(),
             'results': results,
             'session_id': hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8],
+            'retraining_type': 'enhanced_cv_features_lightgbm_ensemble',
             'enhanced_features_used': self.use_enhanced_features,
+            'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
+            'ensemble_enabled': self.enable_ensemble
         }
 
         # Load existing logs
 
             'enhanced_features_info': {
                 'used': self.use_enhanced_features,
                 'available': ENHANCED_FEATURES_AVAILABLE,
+                'feature_comparison': results['comparison_results'].get('feature_engineering_comparison', {}),
+                'ensemble_enabled': self.enable_ensemble
             }
         }
 
             logger.error(f"Failed to log enhanced retraining session: {str(e)}")
 
     def retrain_model(self) -> Tuple[bool, str]:
+        """Main retraining function with enhanced feature engineering, LightGBM, and ensemble voting"""
         try:
+            logger.info("Starting enhanced model retraining with LightGBM and ensemble capabilities...")
 
             # Load existing metadata
             existing_metadata = self.load_existing_metadata()
 
                 logger.warning(f"No production model found: {prod_msg}")
                 # Fall back to initial training
                 try:
+                    from train import main as train_main
                     train_main()
                     return True, "Initial enhanced training completed"
                 except ImportError:
 
             candidate_feature_type = 'enhanced' if self.use_enhanced_features else 'standard'
 
             logger.info(f"Retraining strategy: {prod_feature_type} -> {candidate_feature_type}")
+            logger.info(f"Models to train: {list(self.models.keys())}")
+            logger.info(f"Ensemble enabled: {self.enable_ensemble}")
 
+            # Train candidate model with enhanced features, LightGBM, and ensemble
             candidate_success, candidate_model, candidate_metrics = self.train_candidate_model(df)
             if not candidate_success:
                 return False, f"Enhanced candidate training failed: {candidate_metrics.get('error', 'Unknown error')}"
 
                 'comparison_results': comparison_results,
                 'data_size': len(df),
                 'cv_folds': self.cv_folds,
+                'retraining_method': 'enhanced_cv_features_lightgbm_ensemble',
                 'feature_engineering': {
                     'production_type': prod_feature_type,
                     'candidate_type': candidate_feature_type,
                     'feature_upgrade': comparison_results.get('feature_engineering_comparison', {})
+                },
+                'models_trained': list(self.models.keys()),
+                'ensemble_enabled': self.enable_ensemble,
+                'selected_model': candidate_metrics.get('model_selection', {}).get('selected_model', 'unknown')
             }
 
             self.log_retraining_session(session_results)
 
                 improvement = f1_comp.get('improvement', 0)
                 confidence = promotion_decision.get('confidence', 0)
                 feature_upgrade = promotion_decision.get('feature_engineering_factor', False)
+                selected_model = candidate_metrics.get('model_selection', {}).get('selected_model', 'unknown')
 
                 feature_info = ""
                 if feature_upgrade:
 
                 elif candidate_feature_type == 'enhanced':
                     feature_info = " using enhanced features"
 
+                model_info = f" (selected: {selected_model})"
+                if self.enable_ensemble and selected_model == 'ensemble':
+                    model_info += " - ensemble model with LightGBM"
+
                 success_msg = (
+                    f"Enhanced model promoted successfully{feature_info}{model_info}! "
                     f"F1 improvement: {improvement:.4f}, "
                     f"Confidence: {confidence:.2f}, "
                     f"Reason: {promotion_decision.get('reason', 'Enhanced CV validation passed')}"
 
                 # Keep current model
                 reason = promotion_decision.get('reason', 'No significant improvement detected')
                 confidence = promotion_decision.get('confidence', 0)
+                selected_model = candidate_metrics.get('model_selection', {}).get('selected_model', 'unknown')
 
                 keep_msg = (
                     f"Keeping current model based on enhanced CV analysis. "
+                    f"Candidate was {selected_model}, "
                     f"Reason: {reason}, "
                     f"Confidence: {confidence:.2f}"
                 )
 
             return False, f"Automated enhanced retraining failed: {str(e)}"
 
 
+# Simplified AutomatedRetrainingManager for brevity - keeping core functionality
 class AutomatedRetrainingManager:
     """Manages automated retraining triggers and scheduling with enhanced features"""
 
     def __init__(self, base_dir: Path = None):
         self.base_dir = base_dir or Path("/tmp")
         self.setup_automation_paths()
         self.drift_monitor = AdvancedDriftMonitor()
         self.retraining_active = False
         self.enhanced_features_available = ENHANCED_FEATURES_AVAILABLE
 
         logger.info(f"Automated retraining manager initialized with enhanced features: {self.enhanced_features_available}")
 
     def setup_automation_paths(self):
         """Setup automation-specific paths"""
         self.automation_dir = self.base_dir / "automation"
         self.automation_dir.mkdir(parents=True, exist_ok=True)
         self.automation_log_path = self.automation_dir / "automation_log.json"
 
     def trigger_manual_retraining(self, reason: str = "manual_trigger", use_enhanced: bool = None) -> Dict:
         """Manually trigger retraining with enhanced feature options"""
         try:
             if use_enhanced is None:
                 use_enhanced = self.enhanced_features_available
 
+            retrainer = EnhancedModelRetrainer()
+            retrainer.use_enhanced_features = use_enhanced and ENHANCED_FEATURES_AVAILABLE
 
+            success, result = retrainer.automated_retrain_with_validation()
 
             feature_info = " with enhanced features" if use_enhanced else " with standard features"
+            if success:
+                return {
+                    'success': True,
+                    'message': f'Manual enhanced retraining completed{feature_info}: {result}',
+                    'enhanced_features': use_enhanced
+                }
+            else:
+                return {
+                    'success': False,
+                    'message': f'Manual enhanced retraining failed{feature_info}: {result}',
+                    'enhanced_features': use_enhanced
+                }
 
         except Exception as e:
             logger.error(f"Manual enhanced retraining trigger failed: {e}")
             return {'success': False, 'error': str(e)}
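A hedged example of calling this entry point manually, e.g. from a notebook or an admin endpoint:

# Usage sketch -- not code from this commit.
manager = AutomatedRetrainingManager()
outcome = manager.trigger_manual_retraining(reason="drift_alert", use_enhanced=True)
if outcome.get('success'):
    print(outcome['message'])
else:
    print(outcome.get('message') or outcome.get('error'))
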
 
 def main():
+    """Main execution function with enhanced CV, LightGBM, and ensemble support"""
     retrainer = EnhancedModelRetrainer()
     success, message = retrainer.retrain_model()
 
         print(f"❌ {message}")
         exit(1)
 
+
 if __name__ == "__main__":
     main()