Spaces:

Ahmedik95316
/

Fake-News-Detection-with-MLOps

Running

App Files Files Community

Ahmedik95316 committed on Aug 20

Commit

dbb9a1a

1 Parent(s): ead9c37

Update model/retrain.py

Browse files

Cross Validation Implementation

Files changed (1) hide show

model/retrain.py +529 -153

model/retrain.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import pandas as pd
 import numpy as np
 import joblib
@@ -17,7 +20,9 @@ from sklearn.metrics import (
     accuracy_score, precision_score, recall_score, f1_score,
     roc_auc_score, confusion_matrix, classification_report
 )
-from sklearn.model_selection import cross_val_score, StratifiedKFold
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
@@ -36,13 +41,322 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 class RobustModelRetrainer:
-    """Production-ready model retraining with statistical validation and A/B testing"""
     def __init__(self):
         self.setup_paths()
         self.setup_retraining_config()
         self.setup_statistical_tests()
     def setup_paths(self):
         """Setup all necessary paths"""
@@ -81,7 +395,7 @@ class RobustModelRetrainer:
         self.min_new_samples = 50
         self.improvement_threshold = 0.01  # 1% improvement required
         self.significance_level = 0.05
-        self.cv_folds = 5
         self.test_size = 0.2
         self.random_state = 42
         self.max_retries = 3
@@ -90,9 +404,9 @@ class RobustModelRetrainer:
     def setup_statistical_tests(self):
         """Setup statistical test configurations"""
         self.statistical_tests = {
-            'mcnemar': {'alpha': 0.05, 'name': "McNemar's Test"},
             'paired_ttest': {'alpha': 0.05, 'name': "Paired T-Test"},
-            'wilcoxon': {'alpha': 0.05, 'name': "Wilcoxon Signed-Rank Test"}
         }
     def load_existing_metadata(self) -> Optional[Dict]:
@@ -246,146 +560,129 @@ class RobustModelRetrainer:
         return pipeline
     def train_candidate_model(self, df: pd.DataFrame) -> Tuple[bool, Optional[Any], Dict]:
-        """Train candidate model with comprehensive evaluation"""
         try:
-            logger.info("Training candidate model...")
             # Prepare data
             X = df['text'].values
             y = df['label'].values
-            # Train-test split
-            from sklearn.model_selection import train_test_split
-            X_train, X_test, y_train, y_test = train_test_split(
-                X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
-            )
             # Create and train pipeline
             pipeline = self.create_advanced_pipeline()
-            pipeline.fit(X_train, y_train)
-            # Evaluate candidate model
-            evaluation_results = self.evaluate_model(pipeline, X_test, y_test, X_train, y_train)
-            # Save candidate model
-            joblib.dump(pipeline, self.candidate_pipeline_path)
-            joblib.dump(pipeline.named_steps['model'], self.candidate_model_path)
-            joblib.dump(pipeline.named_steps['vectorize'], self.candidate_vectorizer_path)
-            logger.info(f"Candidate model training completed")
-            logger.info(f"Candidate F1 Score: {evaluation_results['f1']:.4f}")
-            logger.info(f"Candidate Accuracy: {evaluation_results['accuracy']:.4f}")
-            return True, pipeline, evaluation_results
-        except Exception as e:
-            error_msg = f"Candidate model training failed: {str(e)}"
-            logger.error(error_msg)
-            return False, None, {'error': error_msg}
-    def evaluate_model(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
-        """Comprehensive model evaluation"""
-        try:
-            # Predictions
-            y_pred = model.predict(X_test)
-            y_pred_proba = model.predict_proba(X_test)[:, 1]
-            # Basic metrics
-            metrics = {
                 'accuracy': float(accuracy_score(y_test, y_pred)),
                 'precision': float(precision_score(y_test, y_pred, average='weighted')),
                 'recall': float(recall_score(y_test, y_pred, average='weighted')),
                 'f1': float(f1_score(y_test, y_pred, average='weighted')),
-                'roc_auc': float(roc_auc_score(y_test, y_pred_proba)),
-                'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
-                'evaluation_timestamp': datetime.now().isoformat()
             }
-            # Cross-validation
-            if X_train is not None and y_train is not None:
-                try:
-                    cv_scores = cross_val_score(
-                        model, X_train, y_train,
-                        cv=StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state),
-                        scoring='f1_weighted'
-                    )
-                    metrics['cv_f1_mean'] = float(cv_scores.mean())
-                    metrics['cv_f1_std'] = float(cv_scores.std())
-                except Exception as e:
-                    logger.warning(f"Cross-validation failed: {e}")
-            return metrics
         except Exception as e:
-            logger.error(f"Model evaluation failed: {str(e)}")
-            return {'error': str(e)}
-    def compare_models_statistically(self, prod_model, candidate_model, X_test, y_test) -> Dict:
-        """Statistical comparison of models"""
         try:
-            logger.info("Performing statistical model comparison...")
-            # Get predictions
-            prod_pred = prod_model.predict(X_test)
-            candidate_pred = candidate_model.predict(X_test)
-            # Calculate accuracies
-            prod_accuracy = accuracy_score(y_test, prod_pred)
-            candidate_accuracy = accuracy_score(y_test, candidate_pred)
-            comparison_results = {
-                'production_accuracy': float(prod_accuracy),
-                'candidate_accuracy': float(candidate_accuracy),
-                'absolute_improvement': float(candidate_accuracy - prod_accuracy),
-                'relative_improvement': float((candidate_accuracy - prod_accuracy) / prod_accuracy * 100),
-                'statistical_tests': {}
-            }
-            # McNemar's test for paired predictions
-            try:
-                # Create contingency table
-                prod_correct = (prod_pred == y_test)
-                candidate_correct = (candidate_pred == y_test)
-                both_correct = np.sum(prod_correct & candidate_correct)
-                prod_only = np.sum(prod_correct & ~candidate_correct)
-                candidate_only = np.sum(~prod_correct & candidate_correct)
-                both_wrong = np.sum(~prod_correct & ~candidate_correct)
-                # McNemar's test
-                if prod_only + candidate_only > 0:
-                    mcnemar_stat = (abs(prod_only - candidate_only) - 1) ** 2 / (prod_only + candidate_only)
-                    p_value = 1 - stats.chi2.cdf(mcnemar_stat, 1)
-                    comparison_results['statistical_tests']['mcnemar'] = {
-                        'statistic': float(mcnemar_stat),
-                        'p_value': float(p_value),
-                        'significant': p_value < self.significance_level,
-                        'contingency_table': {
-                            'both_correct': int(both_correct),
-                            'prod_only': int(prod_only),
-                            'candidate_only': int(candidate_only),
-                            'both_wrong': int(both_wrong)
-                        }
-                    }
-            except Exception as e:
-                logger.warning(f"McNemar's test failed: {e}")
-            # Practical significance test
-            comparison_results['practical_significance'] = {
-                'meets_threshold': comparison_results['absolute_improvement'] >= self.improvement_threshold,
-                'threshold': self.improvement_threshold,
-                'recommendation': 'promote' if (
-                    comparison_results['absolute_improvement'] >= self.improvement_threshold and
-                    comparison_results['statistical_tests'].get('mcnemar', {}).get('significant', False)
-                ) else 'keep_current'
             }
-            return comparison_results
         except Exception as e:
-            logger.error(f"Statistical comparison failed: {str(e)}")
             return {'error': str(e)}
     def create_backup(self) -> bool:
@@ -415,7 +712,7 @@ class RobustModelRetrainer:
             return False
     def promote_candidate_model(self, candidate_model, candidate_metrics: Dict, comparison_results: Dict) -> bool:
-        """Promote candidate model to production"""
         try:
             logger.info("Promoting candidate model to production...")
@@ -429,7 +726,7 @@ class RobustModelRetrainer:
             shutil.copy2(self.candidate_vectorizer_path, self.prod_vectorizer_path)
             shutil.copy2(self.candidate_pipeline_path, self.prod_pipeline_path)
-            # Update metadata
             metadata = self.load_existing_metadata() or {}
             # Increment version
@@ -443,27 +740,78 @@ class RobustModelRetrainer:
             else:
                 new_version = f"v1.{int(datetime.now().timestamp()) % 1000}"
-            # Update metadata
             metadata.update({
                 'model_version': new_version,
-                'model_type': 'retrained_pipeline',
                 'previous_version': old_version,
-                'test_accuracy': candidate_metrics['accuracy'],
-                'test_f1': candidate_metrics['f1'],
-                'test_precision': candidate_metrics['precision'],
-                'test_recall': candidate_metrics['recall'],
-                'test_roc_auc': candidate_metrics['roc_auc'],
-                'improvement_over_previous': comparison_results['absolute_improvement'],
-                'statistical_significance': comparison_results['statistical_tests'].get('mcnemar', {}).get('significant', False),
                 'promotion_timestamp': datetime.now().isoformat(),
-                'retrain_trigger': 'scheduled_retrain'
             })
             # Save updated metadata
             with open(self.metadata_path, 'w') as f:
                 json.dump(metadata, f, indent=2)
             logger.info(f"Model promoted successfully to {new_version}")
             return True
         except Exception as e:
@@ -471,12 +819,13 @@ class RobustModelRetrainer:
             return False
     def log_retraining_session(self, results: Dict):
-        """Log retraining session results"""
         try:
             log_entry = {
                 'timestamp': datetime.now().isoformat(),
                 'results': results,
-                'session_id': hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8]
             }
             # Load existing logs
@@ -499,13 +848,36 @@ class RobustModelRetrainer:
             with open(self.retraining_log_path, 'w') as f:
                 json.dump(logs, f, indent=2)
         except Exception as e:
             logger.error(f"Failed to log retraining session: {str(e)}")
     def retrain_model(self) -> Tuple[bool, str]:
-        """Main retraining function with comprehensive validation"""
         try:
-            logger.info("Starting model retraining process...")
             # Load existing metadata
             existing_metadata = self.load_existing_metadata()
@@ -528,22 +900,18 @@ class RobustModelRetrainer:
             if len(df) < self.min_new_samples:
                 return False, f"Insufficient new data: {len(df)} < {self.min_new_samples}"
-            # Train candidate model
             candidate_success, candidate_model, candidate_metrics = self.train_candidate_model(df)
             if not candidate_success:
                 return False, f"Candidate training failed: {candidate_metrics.get('error', 'Unknown error')}"
-            # Prepare test data for comparison
             X = df['text'].values
             y = df['label'].values
-            from sklearn.model_selection import train_test_split
-            _, X_test, _, y_test = train_test_split(
-                X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
-            )
-            # Compare models
-            comparison_results = self.compare_models_statistically(
-                prod_model, candidate_model, X_test, y_test
             )
             # Log results
@@ -551,16 +919,15 @@ class RobustModelRetrainer:
                 'candidate_metrics': candidate_metrics,
                 'comparison_results': comparison_results,
                 'data_size': len(df),
-                'test_size': len(X_test)
             }
             self.log_retraining_session(session_results)
-            # Decide whether to promote
-            should_promote = (
-                comparison_results['absolute_improvement'] >= self.improvement_threshold and
-                comparison_results.get('statistical_tests', {}).get('mcnemar', {}).get('significant', False)
-            )
             if should_promote:
                 # Promote candidate model
@@ -569,10 +936,16 @@ class RobustModelRetrainer:
                 )
                 if promotion_success:
                     success_msg = (
-                        f"Model promoted successfully! "
-                        f"Improvement: {comparison_results['absolute_improvement']:.4f} "
-                        f"(F1: {candidate_metrics['f1']:.4f})"
                     )
                     logger.info(success_msg)
                     return True, success_msg
@@ -580,21 +953,24 @@ class RobustModelRetrainer:
                     return False, "Model promotion failed"
             else:
                 # Keep current model
                 keep_msg = (
-                    f"Keeping current model. "
-                    f"Improvement: {comparison_results['absolute_improvement']:.4f} "
-                    f"(threshold: {self.improvement_threshold})"
                 )
                 logger.info(keep_msg)
                 return True, keep_msg
         except Exception as e:
-            error_msg = f"Model retraining failed: {str(e)}"
             logger.error(error_msg)
             return False, error_msg
 def main():
-    """Main execution function"""
     retrainer = RobustModelRetrainer()
     success, message = retrainer.retrain_model()

+# File: model/retrain.py (MODIFIED)
+# Enhanced version with comprehensive cross-validation for retraining
 import pandas as pd
 import numpy as np
 import joblib
     accuracy_score, precision_score, recall_score, f1_score,
     roc_auc_score, confusion_matrix, classification_report
 )
+from sklearn.model_selection import (
+    cross_val_score, StratifiedKFold, cross_validate, train_test_split
+)
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
 )
 logger = logging.getLogger(__name__)
class CVModelComparator:
    """Compare two models using cross-validation and paired statistical tests.

    Both models are scored on the same ``StratifiedKFold`` splits. The
    per-fold scores are compared with a paired t-test, a one-sided Wilcoxon
    signed-rank test and Cohen's d, and a promotion decision is derived from
    the F1 and accuracy comparisons.

    Flags and statistics placed in the result dictionaries are converted to
    built-in ``bool``/``float`` values (not NumPy scalars) so the dictionaries
    can be serialised with ``json``.
    """

    def __init__(self, cv_folds: int = 5, random_state: int = 42,
                 significance_level: float = 0.05,
                 practical_threshold: float = 0.01,
                 large_improvement_threshold: float = 0.02):
        """Store the cross-validation and decision settings.

        Args:
            cv_folds: Upper bound on the number of CV folds.
            random_state: Seed passed to ``StratifiedKFold``.
            significance_level: p-value below which a test counts as
                significant.
            practical_threshold: Minimum mean score gain treated as a
                practically relevant improvement.
            large_improvement_threshold: F1 gain that alone justifies
                promotion, even without statistical significance.
        """
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.significance_level = significance_level
        self.practical_threshold = practical_threshold
        self.large_improvement_threshold = large_improvement_threshold

    def create_cv_strategy(self, X, y) -> "StratifiedKFold":
        """Build a shuffled ``StratifiedKFold`` sized to the data.

        The fold count is the smallest of ``self.cv_folds``, one fold per
        three samples, and the size of the rarest class, but never below 2.

        Args:
            X: Samples; only ``len(X)`` is used.
            y: Labels used to count class membership.

        Returns:
            The configured ``StratifiedKFold`` splitter.

        Raises:
            ValueError: If ``y`` is empty (no class counts to take a minimum of).
        """
        min_samples_per_fold = 3
        max_folds = len(X) // min_samples_per_fold

        # One pass over the labels instead of one comparison per class.
        _, class_counts = np.unique(y, return_counts=True)
        max_folds_by_class = int(class_counts.min())

        # The floor of 2 wins even when the rarest class has a single member.
        actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))

        logger.info(f"Using {actual_folds} CV folds for model comparison")

        return StratifiedKFold(
            n_splits=actual_folds,
            shuffle=True,
            random_state=self.random_state
        )

    def perform_model_cv_evaluation(self, model, X, y, cv_strategy=None) -> Dict:
        """Cross-validate ``model`` and summarise train and test scores.

        Args:
            model: Estimator passed to ``cross_validate``.
            X: Samples.
            y: Labels.
            cv_strategy: Splitter to use; built with ``create_cv_strategy``
                when omitted.

        Returns:
            On success, a dict with ``n_splits``, per-metric ``test_scores``
            and ``train_scores`` summaries, per-fold ``fold_results`` and,
            when accuracy is available, ``overfitting_score`` (train mean
            minus test mean) and ``stability_score`` (1 minus the test
            coefficient of variation). On failure, a dict with ``error`` and
            ``n_splits``.
        """
        if cv_strategy is None:
            cv_strategy = self.create_cv_strategy(X, y)

        logger.info(f"Performing CV evaluation with {cv_strategy.n_splits} folds...")

        scoring_metrics = {
            'accuracy': 'accuracy',
            'precision': 'precision_weighted',
            'recall': 'recall_weighted',
            'f1': 'f1_weighted',
            'roc_auc': 'roc_auc'
        }

        try:
            cv_scores = cross_validate(
                model, X, y,
                cv=cv_strategy,
                scoring=scoring_metrics,
                return_train_score=True,
                n_jobs=1,
                verbose=0
            )

            cv_results = {
                'n_splits': cv_strategy.n_splits,
                'test_scores': {},
                'train_scores': {},
                'fold_results': []
            }

            # Aggregate statistics per metric.
            for metric_name in scoring_metrics:
                test_key = f'test_{metric_name}'
                train_key = f'train_{metric_name}'

                if test_key in cv_scores:
                    test_scores = cv_scores[test_key]
                    cv_results['test_scores'][metric_name] = {
                        'mean': float(np.mean(test_scores)),
                        'std': float(np.std(test_scores)),
                        'min': float(np.min(test_scores)),
                        'max': float(np.max(test_scores)),
                        'scores': test_scores.tolist()
                    }

                if train_key in cv_scores:
                    train_scores = cv_scores[train_key]
                    cv_results['train_scores'][metric_name] = {
                        'mean': float(np.mean(train_scores)),
                        'std': float(np.std(train_scores)),
                        'scores': train_scores.tolist()
                    }

            # Per-fold breakdown of the same scores.
            for fold_idx in range(cv_strategy.n_splits):
                fold_result = {
                    'fold': fold_idx + 1,
                    'test_scores': {},
                    'train_scores': {}
                }

                for metric_name in scoring_metrics:
                    test_key = f'test_{metric_name}'
                    train_key = f'train_{metric_name}'

                    if test_key in cv_scores:
                        fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
                    if train_key in cv_scores:
                        fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])

                cv_results['fold_results'].append(fold_result)

            # Overfitting and stability are derived from accuracy only.
            if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
                train_mean = cv_results['train_scores']['accuracy']['mean']
                test_mean = cv_results['test_scores']['accuracy']['mean']
                cv_results['overfitting_score'] = float(train_mean - test_mean)

                test_std = cv_results['test_scores']['accuracy']['std']
                cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0

            return cv_results

        except Exception as e:
            logger.error(f"CV evaluation failed: {e}")
            return {'error': str(e), 'n_splits': cv_strategy.n_splits}

    def compare_models_with_cv(self, model1, model2, X, y, model1_name="Production", model2_name="Candidate") -> Dict:
        """Compare two models on identical CV folds and decide on promotion.

        Args:
            model1: Baseline model.
            model2: Challenger model.
            X: Samples.
            y: Labels.
            model1_name: Label for ``model1`` in logs and result keys.
            model2_name: Label for ``model2`` in logs and result keys.

        Returns:
            A dict with both models' CV results, ``metric_comparisons`` for
            accuracy, F1, precision and recall, and ``promotion_decision``
            (``promote_candidate``, ``reason``, ``confidence``). If either
            evaluation fails, or an exception is raised, a dict containing
            ``error`` is returned instead.
        """
        logger.info(f"Comparing {model1_name} vs {model2_name} models using CV...")

        try:
            cv_strategy = self.create_cv_strategy(X, y)

            # Same splitter for both models so fold scores are paired.
            results1 = self.perform_model_cv_evaluation(model1, X, y, cv_strategy)
            results2 = self.perform_model_cv_evaluation(model2, X, y, cv_strategy)

            if 'error' in results1 or 'error' in results2:
                return {
                    'error': 'One or both models failed CV evaluation',
                    'model1_results': results1,
                    'model2_results': results2
                }

            comparison_results = {
                'model1_name': model1_name,
                'model2_name': model2_name,
                'cv_folds': cv_strategy.n_splits,
                'model1_cv_results': results1,
                'model2_cv_results': results2,
                # NOTE: kept for output compatibility; nothing in this class
                # fills it. Per-metric tests live under 'metric_comparisons'.
                'statistical_tests': {},
                'metric_comparisons': {}
            }

            for metric in ['accuracy', 'f1', 'precision', 'recall']:
                if (metric in results1['test_scores'] and
                        metric in results2['test_scores']):
                    scores1 = results1['test_scores'][metric]['scores']
                    scores2 = results2['test_scores'][metric]['scores']

                    comparison_results['metric_comparisons'][metric] = self._compare_metric_scores(
                        scores1, scores2, metric, model1_name, model2_name
                    )

            f1_comparison = comparison_results['metric_comparisons'].get('f1', {})
            accuracy_comparison = comparison_results['metric_comparisons'].get('accuracy', {})

            # Promotion rules, checked in order of strength of evidence.
            promote_candidate = False
            promotion_reason = ""

            if f1_comparison.get('significant_improvement', False):
                promote_candidate = True
                promotion_reason = f"Significant F1 improvement: {f1_comparison.get('improvement', 0):.4f}"
            elif (f1_comparison.get('improvement', 0) > self.practical_threshold and
                  accuracy_comparison.get('improvement', 0) > self.practical_threshold):
                promote_candidate = True
                promotion_reason = "Practical improvement in both F1 and accuracy"
            elif f1_comparison.get('improvement', 0) > self.large_improvement_threshold:
                promote_candidate = True
                promotion_reason = f"Large F1 improvement: {f1_comparison.get('improvement', 0):.4f}"
            else:
                promotion_reason = "No significant improvement detected"

            comparison_results['promotion_decision'] = {
                'promote_candidate': promote_candidate,
                'reason': promotion_reason,
                'confidence': self._calculate_decision_confidence(comparison_results)
            }

            logger.info(f"Model comparison completed: {promotion_reason}")
            return comparison_results

        except Exception as e:
            logger.error(f"Model comparison failed: {e}")
            return {'error': str(e)}

    def _compare_metric_scores(self, scores1: list, scores2: list, metric: str,
                               model1_name: str, model2_name: str) -> Dict:
        """Compare paired per-fold scores of one metric for two models.

        Args:
            scores1: Per-fold scores of the baseline model.
            scores2: Per-fold scores of the challenger, in the same fold order.
            metric: Metric name, echoed in the result.
            model1_name: Baseline label; lower-cased to build result keys.
            model2_name: Challenger label; lower-cased to build result keys.

        Returns:
            A dict with means, standard deviations, ``improvement``
            (challenger mean minus baseline mean), ``relative_improvement``
            in percent, ``tests`` (paired t-test and one-sided Wilcoxon),
            ``effect_size`` (Cohen's d), ``practical_significance`` and
            ``significant_improvement``. On an unexpected failure, a dict
            with ``metric`` and ``error``.
        """
        try:
            mean1, mean2 = np.mean(scores1), np.mean(scores2)
            std1, std2 = np.std(scores1), np.std(scores2)
            improvement = mean2 - mean1

            comparison = {
                'metric': metric,
                f'{model1_name.lower()}_mean': float(mean1),
                f'{model2_name.lower()}_mean': float(mean2),
                f'{model1_name.lower()}_std': float(std1),
                f'{model2_name.lower()}_std': float(std2),
                'improvement': float(improvement),
                'relative_improvement': float(improvement / mean1 * 100) if mean1 > 0 else 0,
                'tests': {}
            }

            # Paired t-test (two-sided); direction is checked separately below.
            try:
                t_stat, p_value = stats.ttest_rel(scores2, scores1)
                comparison['tests']['paired_ttest'] = {
                    't_statistic': float(t_stat),
                    'p_value': float(p_value),
                    # bool(): a NumPy comparison yields numpy.bool_, which
                    # json cannot serialise.
                    'significant': bool(p_value < self.significance_level)
                }
            except Exception as e:
                logger.warning(f"Paired t-test failed for {metric}: {e}")

            # Wilcoxon signed-rank test, one-sided: challenger greater than baseline.
            try:
                w_stat, w_p_value = stats.wilcoxon(scores2, scores1, alternative='greater')
                comparison['tests']['wilcoxon'] = {
                    'statistic': float(w_stat),
                    'p_value': float(w_p_value),
                    'significant': bool(w_p_value < self.significance_level)
                }
            except Exception as e:
                logger.warning(f"Wilcoxon test failed for {metric}: {e}")

            # Cohen's d with a pooled standard deviation. The pooled formula
            # weights *sample* variances (ddof=1) by n-1; using population
            # variances here would understate the spread and inflate d.
            try:
                n1, n2 = len(scores1), len(scores2)
                var1 = np.var(scores1, ddof=1)
                var2 = np.var(scores2, ddof=1)
                pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
                cohens_d = improvement / pooled_std if pooled_std > 0 else 0
                comparison['effect_size'] = float(cohens_d)
            except Exception:
                comparison['effect_size'] = 0

            comparison['practical_significance'] = bool(abs(improvement) > self.practical_threshold)
            comparison['significant_improvement'] = bool(
                improvement > self.practical_threshold and
                comparison['tests'].get('paired_ttest', {}).get('significant', False)
            )

            return comparison

        except Exception as e:
            logger.error(f"Metric comparison failed for {metric}: {e}")
            return {'metric': metric, 'error': str(e)}

    def _calculate_decision_confidence(self, comparison_results: Dict) -> float:
        """Score confidence in the promotion decision on a 0..1 scale.

        The score adds up to three contributions: F1 evidence (0.4 when
        significant, 0.2 when only above the practical threshold), the share
        of metrics that improved (weighted 0.3), and the mean absolute
        effect size (0.2 above 0.5, 0.1 above 0.2).

        Args:
            comparison_results: Dict holding ``metric_comparisons``.

        Returns:
            The summed score clamped to ``[0.0, 1.0]``, or 0.5 if the
            calculation raises.
        """
        try:
            confidence_factors = []
            metric_comparisons = comparison_results['metric_comparisons']

            f1_comp = metric_comparisons.get('f1', {})
            if f1_comp.get('significant_improvement', False):
                confidence_factors.append(0.4)
            elif f1_comp.get('improvement', 0) > self.practical_threshold:
                confidence_factors.append(0.2)

            # Consistency: fraction of compared metrics that moved upward.
            improvements = [
                metric_comp['improvement']
                for metric_comp in metric_comparisons.values()
                if isinstance(metric_comp, dict) and 'improvement' in metric_comp
            ]
            if improvements:
                improved_metrics = sum(1 for gain in improvements if gain > 0)
                confidence_factors.append(improved_metrics / len(improvements) * 0.3)

            effect_sizes = [
                abs(metric_comp['effect_size'])
                for metric_comp in metric_comparisons.values()
                if isinstance(metric_comp, dict) and 'effect_size' in metric_comp
            ]
            if effect_sizes:
                avg_effect_size = np.mean(effect_sizes)
                if avg_effect_size > 0.5:  # Large effect
                    confidence_factors.append(0.2)
                elif avg_effect_size > 0.2:  # Medium effect
                    confidence_factors.append(0.1)

            total_confidence = sum(confidence_factors)
            return min(1.0, max(0.0, total_confidence))

        except Exception as e:
            logger.warning(f"Confidence calculation failed: {e}")
            return 0.5
 class RobustModelRetrainer:
+    """Production-ready model retraining with comprehensive CV and statistical validation"""
     def __init__(self):
         self.setup_paths()
         self.setup_retraining_config()
         self.setup_statistical_tests()
+        self.cv_comparator = CVModelComparator()
     def setup_paths(self):
         """Setup all necessary paths"""
         self.min_new_samples = 50
         self.improvement_threshold = 0.01  # 1% improvement required
         self.significance_level = 0.05
+        self.cv_folds = 5  # Increased for better validation
         self.test_size = 0.2
         self.random_state = 42
         self.max_retries = 3
     def setup_statistical_tests(self):
         """Setup statistical test configurations"""
         self.statistical_tests = {
             'paired_ttest': {'alpha': 0.05, 'name': "Paired T-Test"},
+            'wilcoxon': {'alpha': 0.05, 'name': "Wilcoxon Signed-Rank Test"},
+            'mcnemar': {'alpha': 0.05, 'name': "McNemar's Test"}
         }
     def load_existing_metadata(self) -> Optional[Dict]:
         return pipeline
     def train_candidate_model(self, df: pd.DataFrame) -> Tuple[bool, Optional[Any], Dict]:
         """Cross-validate, fit and persist a candidate pipeline.

         The candidate is cross-validated first, then fitted on every row of
         ``df``. A second pipeline fitted on a stratified train split supplies
         holdout metrics. The fully fitted pipeline (and, when it exposes
         ``named_steps``, its 'model' and 'vectorize' steps) is written to the
         candidate paths.

         Args:
             df: Frame with a 'text' column and a 'label' column.

         Returns:
             ``(True, pipeline, evaluation_results)`` on success, where
             ``evaluation_results`` holds 'cross_validation',
             'holdout_evaluation', 'training_samples' and 'test_samples';
             ``(False, None, {'error': message})`` if anything raises.
         """
         try:
             logger.info("Training candidate model with cross-validation...")

             features, labels = df['text'].values, df['label'].values

             final_pipeline = self.create_advanced_pipeline()

             # Score the unfitted candidate first, then fit it on all rows.
             logger.info("Performing cross-validation on candidate model...")
             cv_results = self.cv_comparator.perform_model_cv_evaluation(final_pipeline, features, labels)
             final_pipeline.fit(features, labels)

             # Holdout check uses a separate pipeline that never sees the test split.
             X_fit, X_eval, y_fit, y_eval = train_test_split(
                 features, labels,
                 test_size=self.test_size,
                 stratify=labels,
                 random_state=self.random_state
             )
             holdout_pipeline = self.create_advanced_pipeline()
             holdout_pipeline.fit(X_fit, y_fit)

             predicted = holdout_pipeline.predict(X_eval)
             positive_proba = holdout_pipeline.predict_proba(X_eval)[:, 1]

             holdout_metrics = {'accuracy': float(accuracy_score(y_eval, predicted))}
             weighted_scorers = (
                 ('precision', precision_score),
                 ('recall', recall_score),
                 ('f1', f1_score),
             )
             for metric_key, scorer in weighted_scorers:
                 holdout_metrics[metric_key] = float(scorer(y_eval, predicted, average='weighted'))
             holdout_metrics['roc_auc'] = float(roc_auc_score(y_eval, positive_proba))

             evaluation_results = {
                 'cross_validation': cv_results,
                 'holdout_evaluation': holdout_metrics,
                 'training_samples': len(features),
                 'test_samples': len(X_eval)
             }

             # Persist the fully fitted candidate and, when present, its parts.
             joblib.dump(final_pipeline, self.candidate_pipeline_path)
             if hasattr(final_pipeline, 'named_steps'):
                 steps = final_pipeline.named_steps
                 joblib.dump(steps['model'], self.candidate_model_path)
                 joblib.dump(steps['vectorize'], self.candidate_vectorizer_path)

             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
                 f1_summary = cv_results['test_scores']['f1']
                 cv_f1_mean, cv_f1_std = f1_summary['mean'], f1_summary['std']
                 logger.info(f"Candidate model CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")

             logger.info(f"Candidate model holdout F1: {holdout_metrics['f1']:.4f}")
             logger.info(f"Candidate model training completed")

             return True, final_pipeline, evaluation_results

         except Exception as e:
             error_msg = f"Candidate model training failed: {str(e)}"
             logger.error(error_msg)
             return False, None, {'error': error_msg}
+    def compare_models_with_cv_validation(self, prod_model, candidate_model, X, y) -> Dict:
+        """Compare the production and candidate models using cross-validation.
+
+        Delegates the comparison to ``self.cv_comparator.compare_models_with_cv``
+        (production as model 1, candidate as model 2) and adds flat "legacy"
+        keys on top of the comparator's result.
+
+        Args:
+            prod_model: Model currently in production.
+            candidate_model: Newly trained candidate model.
+            X: Samples, forwarded unchanged to the comparator.
+            y: Labels, forwarded unchanged to the comparator.
+
+        Returns:
+            The comparator's result merged with ``production_cv_results``,
+            ``candidate_cv_results``, ``statistical_tests`` and
+            ``promotion_decision``. When both CV results carry accuracy scores,
+            ``production_accuracy``, ``candidate_accuracy`` and
+            ``absolute_improvement`` are added as well; ``relative_improvement``
+            (in percent) is added only when the production accuracy is non-zero.
+            If the comparator reports an error its result is returned as is, and
+            any exception raised here yields ``{'error': <message>}``.
+        """
+        logger.info("Performing comprehensive model comparison with CV...")
         try:
+            # Use the CV comparator for detailed analysis
+            comparison_results = self.cv_comparator.compare_models_with_cv(
+                prod_model, candidate_model, X, y, "Production", "Candidate"
+            )
+            if 'error' in comparison_results:
+                return comparison_results
+            # Additional legacy format for backward compatibility
+            legacy_comparison = {
+                'production_cv_results': comparison_results['model1_cv_results'],
+                'candidate_cv_results': comparison_results['model2_cv_results'],
+                'statistical_tests': comparison_results['statistical_tests'],
+                'promotion_decision': comparison_results['promotion_decision']
             }
+            # Extract key metrics for legacy format
+            prod_cv = comparison_results['model1_cv_results']
+            cand_cv = comparison_results['model2_cv_results']
+            if 'test_scores' in prod_cv and 'test_scores' in cand_cv:
+                if 'accuracy' in prod_cv['test_scores'] and 'accuracy' in cand_cv['test_scores']:
+                    prod_acc = prod_cv['test_scores']['accuracy']['mean']
+                    cand_acc = cand_cv['test_scores']['accuracy']['mean']
+                    absolute_improvement = cand_acc - prod_acc
+                    legacy_comparison.update({
+                        'production_accuracy': prod_acc,
+                        'candidate_accuracy': cand_acc,
+                        'absolute_improvement': absolute_improvement
+                    })
+                    if prod_acc != 0:
+                        legacy_comparison['relative_improvement'] = (
+                            absolute_improvement / prod_acc * 100
+                        )
+                    else:
+                        # A relative change against a zero baseline is undefined.
+                        # Dividing here would raise ZeroDivisionError and the
+                        # handler below would discard the entire comparison, so
+                        # only the ratio is left out.
+                        logger.warning(
+                            "Production accuracy is 0; relative improvement is undefined and omitted"
+                        )
+            # Merge detailed and legacy formats
+            final_results = {**comparison_results, **legacy_comparison}
+            # Log summary
+            f1_comp = comparison_results.get('metric_comparisons', {}).get('f1', {})
+            if f1_comp:
+                logger.info(f"F1 improvement: {f1_comp.get('improvement', 0):.4f}")
+                logger.info(f"Significant improvement: {f1_comp.get('significant_improvement', False)}")
+            promotion_decision = comparison_results.get('promotion_decision', {})
+            logger.info(f"Promotion recommendation: {promotion_decision.get('promote_candidate', False)}")
+            logger.info(f"Reason: {promotion_decision.get('reason', 'Unknown')}")
+            return final_results
         except Exception as e:
+            # Failures are reported through the returned dict rather than raised.
+            logger.error(f"Model comparison failed: {str(e)}")
             return {'error': str(e)}
     def create_backup(self) -> bool:
             # NOTE(review): only this failure return is visible in this view; the
             # rest of the method body is not shown here.
             return False
     def promote_candidate_model(self, candidate_model, candidate_metrics: Dict, comparison_results: Dict) -> bool:
+        """Promote candidate model to production with enhanced metadata"""
         # Copies the candidate artifacts over the production paths and rewrites
         # the metadata JSON with holdout, cross-validation and comparison details.
         #
         # candidate_metrics: read for 'cross_validation', 'holdout_evaluation',
         #     'training_samples' and 'test_samples'.
         # comparison_results: read for 'promotion_decision' and
         #     'metric_comparisons'.
         # Returns True after the metadata file has been written, False when an
         # exception is raised inside the try block.
         # NOTE(review): candidate_model is not referenced in the lines visible
         # here, and old_version is assigned in lines not shown — confirm.
         try:
             logger.info("Promoting candidate model to production...")
             # Overwrite the production vectorizer and pipeline files with the
             # candidate ones (copy2 also copies file metadata).
             shutil.copy2(self.candidate_vectorizer_path, self.prod_vectorizer_path)
             shutil.copy2(self.candidate_pipeline_path, self.prod_pipeline_path)
+            # Update metadata with comprehensive CV information
             # Start from the existing metadata, or an empty dict if none loads.
             metadata = self.load_existing_metadata() or {}
             # Increment version
             else:
                 # Fallback version: 'v1.' plus the current Unix timestamp modulo 1000.
                 new_version = f"v1.{int(datetime.now().timestamp()) % 1000}"
+            # Extract metrics from candidate evaluation
+            cv_results = candidate_metrics.get('cross_validation', {})
+            holdout_results = candidate_metrics.get('holdout_evaluation', {})
+            # Update metadata with comprehensive information
             metadata.update({
                 'model_version': new_version,
+                'model_type': 'retrained_pipeline_cv',
                 'previous_version': old_version,
                 'promotion_timestamp': datetime.now().isoformat(),
+                'retrain_trigger': 'cv_validated_retrain',
+                'training_samples': candidate_metrics.get('training_samples', 'Unknown'),
+                'test_samples': candidate_metrics.get('test_samples', 'Unknown')
             })
+            # Add holdout evaluation results
             # Metrics missing from the holdout results are stored as the string 'Unknown'.
+            if holdout_results:
+                metadata.update({
+                    'test_accuracy': holdout_results.get('accuracy', 'Unknown'),
+                    'test_f1': holdout_results.get('f1', 'Unknown'),
+                    'test_precision': holdout_results.get('precision', 'Unknown'),
+                    'test_recall': holdout_results.get('recall', 'Unknown'),
+                    'test_roc_auc': holdout_results.get('roc_auc', 'Unknown')
+                })
+            # Add comprehensive CV results
             # Skipped entirely when the CV results carry no 'test_scores' entry.
+            if cv_results and 'test_scores' in cv_results:
+                metadata['cross_validation'] = {
                     # n_splits falls back to the configured fold count.
+                    'n_splits': cv_results.get('n_splits', self.cv_folds),
+                    'test_scores': cv_results['test_scores'],
+                    'train_scores': cv_results.get('train_scores', {}),
+                    'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
+                    'stability_score': cv_results.get('stability_score', 'Unknown'),
                     # Stored under a different key than it is read from ('fold_results').
+                    'individual_fold_results': cv_results.get('fold_results', [])
+                }
+                # Add CV summary statistics
                 # Direct indexing: a missing 'mean'/'std'/'min'/'max' raises
                 # KeyError, which the handler below turns into a False return.
+                if 'f1' in cv_results['test_scores']:
+                    metadata.update({
+                        'cv_f1_mean': cv_results['test_scores']['f1']['mean'],
+                        'cv_f1_std': cv_results['test_scores']['f1']['std'],
+                        'cv_f1_min': cv_results['test_scores']['f1']['min'],
+                        'cv_f1_max': cv_results['test_scores']['f1']['max']
+                    })
+            # Add model comparison results
+            promotion_decision = comparison_results.get('promotion_decision', {})
+            metadata['promotion_validation'] = {
+                'decision_confidence': promotion_decision.get('confidence', 'Unknown'),
+                'promotion_reason': promotion_decision.get('reason', 'Unknown'),
+                'comparison_method': 'cross_validation_statistical_tests'
+            }
+            # Add statistical test results
+            metric_comparisons = comparison_results.get('metric_comparisons', {})
+            if metric_comparisons:
+                metadata['statistical_validation'] = {}
+                for metric, comparison in metric_comparisons.items():
                     # Entries that are not dicts are skipped.
+                    if isinstance(comparison, dict):
+                        metadata['statistical_validation'][metric] = {
+                            'improvement': comparison.get('improvement', 0),
+                            'significant_improvement': comparison.get('significant_improvement', False),
+                            'effect_size': comparison.get('effect_size', 0),
+                            'tests': comparison.get('tests', {})
+                        }
             # Save updated metadata
             # NOTE(review): json.dump requires every stored value to be
             # JSON-serializable — confirm the CV results contain no NumPy types.
             with open(self.metadata_path, 'w') as f:
                 json.dump(metadata, f, indent=2)
             logger.info(f"Model promoted successfully to {new_version}")
+            logger.info(f"Promotion reason: {promotion_decision.get('reason', 'CV validation passed')}")
             return True
         except Exception as e:
             # Failure is signalled through the return value, not by re-raising.
             return False
     def log_retraining_session(self, results: Dict):
+        """Log comprehensive retraining session results"""
         # Writes the session entry to the retraining log and, when `results`
         # holds 'comparison_results', appends a matching entry to the separate
         # comparison log. Returns nothing; any exception is logged and swallowed.
         try:
             log_entry = {
                 'timestamp': datetime.now().isoformat(),
                 'results': results,
                 # Short identifier: first 8 hex characters of the MD5 digest of
                 # the current timestamp string.
+                'session_id': hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8],
+                'retraining_type': 'cv_enhanced'
             }
             # Load existing logs
             # NOTE(review): `logs` is built in lines not visible in this view.
             with open(self.retraining_log_path, 'w') as f:
                 json.dump(logs, f, indent=2)
+            # Also save detailed comparison results
+            if 'comparison_results' in results:
+                comparison_logs = []
+                if self.comparison_log_path.exists():
                     # An unreadable or malformed comparison log is replaced by
                     # an empty history rather than aborting the write.
                     # NOTE(review): the bare `except:` also catches
                     # KeyboardInterrupt and SystemExit; consider narrowing it.
+                    try:
+                        with open(self.comparison_log_path, 'r') as f:
+                            comparison_logs = json.load(f)
+                    except:
+                        comparison_logs = []
+                comparison_entry = {
+                    'timestamp': datetime.now().isoformat(),
                     # Same id as the main log entry, so the two records can be matched.
+                    'session_id': log_entry['session_id'],
+                    'comparison_details': results['comparison_results']
+                }
+                comparison_logs.append(comparison_entry)
                 # Keep only the 50 most recent comparison entries.
+                if len(comparison_logs) > 50:
+                    comparison_logs = comparison_logs[-50:]
+                with open(self.comparison_log_path, 'w') as f:
+                    json.dump(comparison_logs, f, indent=2)
         except Exception as e:
             # Logging failures are reported but never propagated to the caller.
             logger.error(f"Failed to log retraining session: {str(e)}")
     def retrain_model(self) -> Tuple[bool, str]:
+        """Main retraining function with comprehensive CV validation"""
         # Returns (success, message). Deciding to keep the current model is
         # reported as success (True). False is returned for too little data, a
         # failed candidate training, a failed promotion, or any exception.
         try:
+            logger.info("Starting enhanced model retraining with cross-validation...")
             # Load existing metadata
             existing_metadata = self.load_existing_metadata()
             # NOTE(review): df and prod_model are assigned in lines not visible
             # in this view.
             if len(df) < self.min_new_samples:
                 return False, f"Insufficient new data: {len(df)} < {self.min_new_samples}"
+            # Train candidate model with CV
             candidate_success, candidate_model, candidate_metrics = self.train_candidate_model(df)
             if not candidate_success:
                 return False, f"Candidate training failed: {candidate_metrics.get('error', 'Unknown error')}"
+            # Prepare data for model comparison
             # Uses the 'text' and 'label' columns of df as plain arrays.
             X = df['text'].values
             y = df['label'].values
+            # Comprehensive model comparison with CV
+            comparison_results = self.compare_models_with_cv_validation(
+                prod_model, candidate_model, X, y
             )
             # Log results
                 'candidate_metrics': candidate_metrics,
                 'comparison_results': comparison_results,
                 'data_size': len(df),
+                'cv_folds': self.cv_folds,
+                'retraining_method': 'cv_enhanced'
             }
             # The session is logged before the promotion decision is acted on.
             self.log_retraining_session(session_results)
+            # Decision based on CV comparison
             # When comparison_results has no 'promotion_decision' (for example it
             # only holds an 'error' entry), should_promote defaults to False.
+            promotion_decision = comparison_results.get('promotion_decision', {})
+            should_promote = promotion_decision.get('promote_candidate', False)
             if should_promote:
                 # Promote candidate model
                 )
                 if promotion_success:
+                    # Extract improvement information
+                    f1_comp = comparison_results.get('metric_comparisons', {}).get('f1', {})
+                    improvement = f1_comp.get('improvement', 0)
+                    confidence = promotion_decision.get('confidence', 0)
                     # NOTE(review): the :.4f / :.2f formats below require
                     # numeric values for improvement and confidence.
                     success_msg = (
+                        f"Model promoted successfully with CV validation! "
+                        f"F1 improvement: {improvement:.4f}, "
+                        f"Confidence: {confidence:.2f}, "
+                        f"Reason: {promotion_decision.get('reason', 'CV validation passed')}"
                     )
                     logger.info(success_msg)
                     return True, success_msg
                     return False, "Model promotion failed"
             else:
                 # Keep current model
+                reason = promotion_decision.get('reason', 'No significant improvement detected')
+                confidence = promotion_decision.get('confidence', 0)
                 keep_msg = (
+                    f"Keeping current model based on CV analysis. "
+                    f"Reason: {reason}, "
+                    f"Confidence: {confidence:.2f}"
                 )
                 logger.info(keep_msg)
                 # Not promoting is still a successful run from the caller's view.
                 return True, keep_msg
         except Exception as e:
             # Any unexpected failure is converted into a (False, message) result.
+            error_msg = f"Enhanced model retraining failed: {str(e)}"
             logger.error(error_msg)
             return False, error_msg
 def main():
+    """Main execution function with CV enhancements"""
     retrainer = RobustModelRetrainer()
     success, message = retrainer.retrain_model()