Commit 52d71b1
Parent(s): dce7bfe
Update model/train.py
Restored previous working version
model/train.py: +258 -629
model/train.py
CHANGED
@@ -27,161 +27,46 @@ from typing import Dict, Tuple, Optional, Any
  import warnings
  warnings.filterwarnings('ignore')


- # …
- # CENTRALIZED PATH CONFIGURATION - MATCHES FASTAPI SERVER
- # =============================================================================
- class PathConfig:
-     """Centralized path management to ensure consistency across all components"""
-
-     # Base directories
-     BASE_DIR = Path("/tmp")
-     DATA_DIR = BASE_DIR / "data"
-     MODEL_DIR = BASE_DIR / "model"  # CONSISTENT: /tmp/model/
-     LOGS_DIR = BASE_DIR / "logs"
-     RESULTS_DIR = BASE_DIR / "results"
-
-     # Model files - CONSISTENT PATHS (matches fastapi_server.py)
-     MODEL_FILE = MODEL_DIR / "model.pkl"            # /tmp/model/model.pkl
-     VECTORIZER_FILE = MODEL_DIR / "vectorizer.pkl"  # /tmp/model/vectorizer.pkl
-     PIPELINE_FILE = MODEL_DIR / "pipeline.pkl"      # /tmp/model/pipeline.pkl
-     METADATA_FILE = BASE_DIR / "metadata.json"      # /tmp/metadata.json
-
-     # Data files
-     COMBINED_DATASET = DATA_DIR / "combined_dataset.csv"
-     SCRAPED_DATA = DATA_DIR / "scraped_real.csv"
-     GENERATED_DATA = DATA_DIR / "generated_fake.csv"
-
-     # Log and result files
-     TRAINING_LOG = LOGS_DIR / "model_training.log"
-     EVALUATION_RESULTS = RESULTS_DIR / "evaluation_results.json"
-
-     @classmethod
-     def ensure_directories(cls):
-         """Create all required directories with proper permissions"""
-         for attr_name in dir(cls):
-             attr = getattr(cls, attr_name)
-             if isinstance(attr, Path) and attr_name.endswith('_DIR'):
-                 attr.mkdir(parents=True, exist_ok=True, mode=0o755)
-
-         # Additional directory creation for safety
-         for directory in [cls.BASE_DIR, cls.DATA_DIR, cls.MODEL_DIR, cls.LOGS_DIR, cls.RESULTS_DIR]:
-             directory.mkdir(parents=True, exist_ok=True, mode=0o755)
-
-
- # Initialize directories at startup
- PathConfig.ensure_directories()
-
-
- # =============================================================================
- # ENHANCED LOGGING CONFIGURATION
- # =============================================================================
  logging.basicConfig(
      level=logging.INFO,
-     format='%(asctime)s - %(levelname)s - %(…
      handlers=[
-         logging.FileHandler(…
          logging.StreamHandler()
      ]
  )
  logger = logging.getLogger(__name__)


- # =============================================================================
- # DATA VALIDATION PIPELINE
- # =============================================================================
- class DataValidator:
-     """Comprehensive data validation for training pipeline"""
-
-     def __init__(self, min_text_length: int = 10, max_null_ratio: float = 0.1):
-         self.min_text_length = min_text_length
-         self.max_null_ratio = max_null_ratio
-
-     def validate_schema(self, df: pd.DataFrame) -> Tuple[bool, list]:
-         """Validate data schema"""
-         errors = []
-         required_columns = ['text', 'label']
-
-         missing_cols = set(required_columns) - set(df.columns)
-         if missing_cols:
-             errors.append(f"Missing required columns: {missing_cols}")
-
-         return len(errors) == 0, errors
-
-     def validate_quality(self, df: pd.DataFrame) -> Tuple[bool, list]:
-         """Validate data quality"""
-         errors = []
-
-         # Check null ratio
-         null_ratio = df.isnull().sum().sum() / (len(df) * len(df.columns))
-         if null_ratio > self.max_null_ratio:
-             errors.append(f"Too many nulls: {null_ratio:.2%} > {self.max_null_ratio:.2%}")
-
-         # Check text quality
-         if 'text' in df.columns:
-             short_texts = (df['text'].astype(str).str.len() < self.min_text_length).sum()
-             if short_texts > 0:
-                 errors.append(f"{short_texts} texts below minimum length ({self.min_text_length} chars)")
-
-         # Check minimum samples
-         if len(df) < 100:
-             errors.append(f"Insufficient samples for training: {len(df)} < 100")
-
-         # Check class distribution
-         if 'label' in df.columns:
-             unique_labels = df['label'].unique()
-             if len(unique_labels) < 2:
-                 errors.append(f"Need at least 2 classes, found: {unique_labels}")
-
-             label_counts = df['label'].value_counts()
-             min_class_ratio = label_counts.min() / label_counts.max()
-             if min_class_ratio < 0.05:
-                 errors.append(f"Severe class imbalance: {min_class_ratio:.3f}")
-             elif min_class_ratio < 0.1:
-                 logger.warning(f"Class imbalance detected: {min_class_ratio:.3f}")
-
-         return len(errors) == 0, errors
-
-     def validate(self, df: pd.DataFrame) -> Tuple[bool, Dict[str, list]]:
-         """Complete data validation"""
-         all_valid = True
-         all_errors = {}
-
-         # Schema validation
-         schema_valid, schema_errors = self.validate_schema(df)
-         if not schema_valid:
-             all_valid = False
-             all_errors['schema'] = schema_errors
-
-         # Quality validation
-         quality_valid, quality_errors = self.validate_quality(df)
-         if not quality_valid:
-             all_valid = False
-             all_errors['quality'] = quality_errors
-
-         return all_valid, all_errors
-
-
- # =============================================================================
- # ENHANCED MODEL TRAINER WITH FIXED PATHS
- # =============================================================================
  class RobustModelTrainer:
-     """Production-ready model trainer with comprehensive evaluation and…

      def __init__(self):
-
-         PathConfig.ensure_directories()
          self.setup_training_config()
          self.setup_models()
-
-
-

      def setup_training_config(self):
          """Setup training configuration"""
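The deleted PathConfig block existed so that training and serving resolve identical artifact locations. A minimal sketch of the consuming side of that contract, assuming a loader that mirrors the same constants (the actual fastapi_server.py is not shown in this commit, so every name below is illustrative):

# Hypothetical consumer sketch: resolve the same artifact paths the trainer
# writes. Constants mirror the removed PathConfig; treat them as assumptions.
from pathlib import Path
import joblib

MODEL_DIR = Path("/tmp") / "model"
PIPELINE_FILE = MODEL_DIR / "pipeline.pkl"  # must equal the trainer's save path

def load_pipeline():
    """Load the trained pipeline, failing loudly if training never ran."""
    if not PIPELINE_FILE.exists():
        raise FileNotFoundError(f"No trained pipeline at {PIPELINE_FILE}")
    return joblib.load(PIPELINE_FILE)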
@@ -227,47 +112,57 @@ class RobustModelTrainer:
          }

      def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
-         """Load and validate training data…
          try:
-             logger.info("Loading…
-             …
-             if not data_path.exists():
-                 return False, None, f"Data file not found: {data_path}"

              # Load data
-             df = pd.read_csv(data_path)
-             …
-             # Clean data
-             initial_count = len(df)

              # Remove missing values
-             …
-             # Remove short texts
-             df = df[df['text'].astype(str).str.len() >= self.data_validator.min_text_length]

              if len(df) < initial_count:
-                 logger.…

-             # …
              label_counts = df['label'].value_counts()

              logger.info(f"Class distribution: {label_counts.to_dict()}")

-             return True, df, "Data loaded…

          except Exception as e:
              error_msg = f"Error loading data: {str(e)}"
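The removed DataValidator flagged class imbalance at two thresholds (hard error below a 0.05 min/max ratio, warning below 0.1). A toy run of that same ratio check, assuming only a pandas DataFrame with a `label` column:

# Toy demonstration of the imbalance check from the removed DataValidator.
# Thresholds (0.05 hard error, 0.1 warning) are taken from the old code.
import pandas as pd

df = pd.DataFrame({"label": ["real"] * 95 + ["fake"] * 5})
counts = df["label"].value_counts()
ratio = counts.min() / counts.max()  # 5 / 95 ≈ 0.053

if ratio < 0.05:
    print(f"Severe class imbalance: {ratio:.3f}")
elif ratio < 0.1:
    print(f"Class imbalance detected: {ratio:.3f}")  # this branch fires here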
@@ -275,40 +170,33 @@ class RobustModelTrainer:
          return False, None, error_msg

      def preprocess_text(self, text):
-         """Advanced text preprocessing…
          import re

-         …
-             text = str(text)

-             …
-         except Exception as e:
-             logger.warning(f"Text preprocessing failed for text, returning original: {e}")
-             return str(text).lower()

      def create_preprocessing_pipeline(self) -> Pipeline:
-         """Create advanced preprocessing pipeline…
-         logger.info("🔧 Creating preprocessing pipeline...")
-
          # Text preprocessing
          text_preprocessor = FunctionTransformer(
              func=lambda x: [self.preprocess_text(text) for text in x],
@@ -340,197 +228,95 @@ class RobustModelTrainer:
          ('model', None)  # Will be set during training
          ])

-         …
-         logger.info("💾 Saving model artifacts with corrected paths...")
-
-         # FIXED: Use centralized path configuration
-         pipeline_path = PathConfig.PIPELINE_FILE      # /tmp/model/pipeline.pkl
-         model_path = PathConfig.MODEL_FILE            # /tmp/model/model.pkl
-         vectorizer_path = PathConfig.VECTORIZER_FILE  # /tmp/model/vectorizer.pkl
-         metadata_path = PathConfig.METADATA_FILE      # /tmp/metadata.json
-
-         logger.info(f"Saving to paths:")
-         logger.info(f"   Pipeline: {pipeline_path}")
-         logger.info(f"   Model: {model_path}")
-         logger.info(f"   Vectorizer: {vectorizer_path}")
-         logger.info(f"   Metadata: {metadata_path}")
-
-         # Save the complete pipeline (FIXED PATH)
-         joblib.dump(model, pipeline_path)
-         logger.info("Saved complete pipeline")
-
-         # Save individual components for backward compatibility (FIXED PATHS)
-         try:
-             if hasattr(model, 'named_steps'):
-                 # Save individual model
-                 if 'model' in model.named_steps and model.named_steps['model'] is not None:
-                     joblib.dump(model.named_steps['model'], model_path)
-                     logger.info("Saved individual model component")
-
-                 # Save individual vectorizer
-                 if 'vectorize' in model.named_steps and model.named_steps['vectorize'] is not None:
-                     joblib.dump(model.named_steps['vectorize'], vectorizer_path)
-                     logger.info("Saved individual vectorizer component")
-             else:
-                 logger.warning("Model doesn't have named_steps, skipping individual component saves")
-
-         except Exception as e:
-             logger.warning(f"Could not save individual components: {e}")
-
-         # Generate comprehensive metadata
-         metadata = self.generate_metadata(model_name, metrics)
-
-         # Save metadata (FIXED PATH)
-         with open(metadata_path, 'w') as f:
-             json.dump(metadata, f, indent=2)
-         logger.info("Saved model metadata")
-
-         # Verify all files were created
-         verification_results = {
-             'pipeline': pipeline_path.exists(),
-             'model': model_path.exists(),
-             'vectorizer': vectorizer_path.exists(),
-             'metadata': metadata_path.exists()
-         }
-
-         logger.info("🔍 File verification results:")
-         for file_type, exists in verification_results.items():
-             status = "✅" if exists else "❌"
-             logger.info(f"   {status} {file_type}: {exists}")
-
-         # Check if at least the pipeline was saved
-         if not verification_results['pipeline']:
-             raise Exception("Critical: Pipeline file was not created")
-
-         logger.info("🎉 Model artifacts saved successfully!")
-         return True
-
-     except Exception as e:
-         logger.error(f"❌ Failed to save model artifacts: {str(e)}")
-         return False

-         …
-         """Generate comprehensive metadata"""
-         # Generate data hash for versioning
-         data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8]
-
-         metadata = {
-             'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-             'model_type': model_name,
-             'data_version': data_hash,
-             'training_metrics': {
-                 'test_accuracy': metrics.get('accuracy', 'Unknown'),
-                 'test_f1': metrics.get('f1', 'Unknown'),
-                 'test_precision': metrics.get('precision', 'Unknown'),
-                 'test_recall': metrics.get('recall', 'Unknown'),
-                 'test_roc_auc': metrics.get('roc_auc', 'Unknown'),
-                 'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
-                 'cv_score_mean': metrics.get('cv_scores', {}).get('mean', 'Unknown'),
-                 'cv_score_std': metrics.get('cv_scores', {}).get('std', 'Unknown')
-             },
-             'training_config': {
-                 'test_size': self.test_size,
-                 'validation_size': self.validation_size,
-                 'cv_folds': self.cv_folds,
-                 'max_features': self.max_features,
-                 'ngram_range': self.ngram_range,
-                 'feature_selection_k': self.feature_selection_k,
-                 'class_weight': self.class_weight
-             },
-             'paths': {
-                 'pipeline_file': str(PathConfig.PIPELINE_FILE),
-                 'model_file': str(PathConfig.MODEL_FILE),
-                 'vectorizer_file': str(PathConfig.VECTORIZER_FILE)
-             },
-             'timestamp': datetime.now().isoformat(),
-             'training_completed': True
-         }
-
-         return metadata

      def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
          """Comprehensive model evaluation with multiple metrics"""
          logger.info("Starting comprehensive model evaluation...")

-         …
-             try:
-                 metrics['roc_auc'] = float(roc_auc_score(y_test, y_pred_proba))
-             except Exception as e:
-                 logger.warning(f"Could not calculate ROC AUC: {e}")
-                 metrics['roc_auc'] = 0.0
-         else:
-             metrics['roc_auc'] = 0.0
-
-         # Confusion matrix
-         cm = confusion_matrix(y_test, y_pred)
-         metrics['confusion_matrix'] = cm.tolist()
-
-         # Classification report
          try:
-             …
          except Exception as e:
-             logger.warning(f"…

-             …
-             )
-             …

          # Training accuracy for overfitting detection
          if X_train is not None and y_train is not None:
-             …
-         except Exception as e:
-             logger.warning(f"Overfitting detection failed: {e}")
-
-         logger.info(f"📈 Evaluation completed - F1: {metrics['f1']:.4f}, Accuracy: {metrics['accuracy']:.4f}")
-         return metrics
-
      except Exception as e:
-         logger.…
-             'f1': 0.0, 'roc_auc': 0.0, 'error': str(e)
-         }

      def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
          """Perform hyperparameter tuning with cross-validation"""
-         logger.info(f"…

          try:
              # Set the model in the pipeline
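The removed save path dumps the full pipeline, then verifies each expected file actually exists before declaring success. A minimal, self-contained sketch of that dump-then-verify pattern, with a plain dict standing in for the trained sklearn pipeline:

# Minimal dump-then-verify sketch, mirroring the removed save logic.
from pathlib import Path
import joblib

model_dir = Path("/tmp/model")
model_dir.mkdir(parents=True, exist_ok=True)
pipeline_path = model_dir / "pipeline.pkl"

joblib.dump({"placeholder": True}, pipeline_path)  # stand-in artifact

# Verify the artifact landed on disk before reporting success.
if not pipeline_path.exists():
    raise RuntimeError("Critical: Pipeline file was not created")
print(f"Saved {pipeline_path} ({pipeline_path.stat().st_size} bytes)")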
@@ -543,7 +329,8 @@ class RobustModelTrainer:
          grid_search = GridSearchCV(
              pipeline,
              param_grid,
-             cv=StratifiedKFold(n_splits=self.cv_folds,…
              scoring='f1_weighted',
              n_jobs=-1,
              verbose=1
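Both the old and restored versions tune with GridSearchCV over a StratifiedKFold split scored by weighted F1. A self-contained toy run of that same setup; the estimator and grid here are illustrative, not the ones defined in train.py:

# Self-contained GridSearchCV toy run; only the StratifiedKFold plus
# f1_weighted scoring mirrors train.py, the estimator and grid are made up.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=200, n_features=20, random_state=42)
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid={"C": [0.1, 1.0, 10.0]},
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring="f1_weighted",
    n_jobs=-1,
)
grid.fit(X, y)
print(grid.best_params_, round(grid.best_score_, 4))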
@@ -560,7 +347,7 @@ class RobustModelTrainer:
          'cv_results': {
              'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
              'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
-             'params':…
          }
      }

@@ -571,19 +358,16 @@ class RobustModelTrainer:
          return grid_search.best_estimator_, tuning_results

      except Exception as e:
-         logger.error(…
          # Return basic model if tuning fails
-             …
-             return pipeline, {'error': str(e), 'used_default_params': True}
-         except Exception as e2:
-             logger.error(f"❌ Even basic model training failed: {str(e2)}")
-             raise e2

      def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
          """Train and evaluate multiple models"""
-         logger.info("…

          results = {}

@@ -591,7 +375,7 @@ class RobustModelTrainer:
          logger.info(f"Training {model_name}...")

          try:
-             # Create…
              pipeline = self.create_preprocessing_pipeline()

              # Hyperparameter tuning

@@ -612,18 +396,18 @@ class RobustModelTrainer:
              'training_time': datetime.now().isoformat()
          }

-         logger.info(f"…
              f"Accuracy: {evaluation_metrics['accuracy']:.4f}")

      except Exception as e:
-         logger.error(f"…
          results[model_name] = {'error': str(e)}

      return results

      def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
          """Select the best performing model"""
-         logger.info("…

          best_model_name = None
          best_model = None

@@ -632,7 +416,6 @@ class RobustModelTrainer:

      for model_name, result in results.items():
          if 'error' in result:
-             logger.warning(f"Skipping {model_name} due to error: {result['error']}")
              continue

          # Use F1 score as primary metric

@@ -645,11 +428,69 @@ class RobustModelTrainer:
          best_metrics = result['evaluation_metrics']

      if best_model_name is None:
-         raise ValueError("…

-     logger.info(…
      return best_model_name, best_model, best_metrics

      def save_evaluation_results(self, results: Dict) -> bool:
          """Save comprehensive evaluation results"""
          try:
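select_best_model ranks the trained candidates by F1 and fails if nothing trained. A toy version of that selection over a results dict shaped like the trainer's output:

# Toy best-model selection over a results dict shaped like the trainer's
# output: F1 is the primary metric, errored models are skipped.
results = {
    "logistic_regression": {"evaluation_metrics": {"f1": 0.91}},
    "random_forest": {"error": "training failed"},
    "naive_bayes": {"evaluation_metrics": {"f1": 0.87}},
}

best_name, best_f1 = None, -1.0
for name, result in results.items():
    if "error" in result:
        continue  # skip models that failed to train
    f1 = result["evaluation_metrics"]["f1"]
    if f1 > best_f1:
        best_name, best_f1 = name, f1

if best_name is None:
    raise ValueError("No models trained successfully")
print(best_name, best_f1)  # logistic_regression 0.91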
@@ -662,32 +503,31 @@ class RobustModelTrainer:
          clean_results[model_name] = {
              'tuning_results': {
                  k: v for k, v in result['tuning_results'].items()
-                 if k != 'best_estimator'
              },
              'evaluation_metrics': result['evaluation_metrics'],
              'training_time': result['training_time']
          }

-         # Save results
-         evaluation_path…
-         with open(evaluation_path, 'w') as f:
              json.dump(clean_results, f, indent=2, default=str)

-         logger.info(f"…
          return True

      except Exception as e:
-         logger.error(f"…
          return False

      def train_model(self, data_path: str = None) -> Tuple[bool, str]:
          """Main training function with comprehensive pipeline"""
          try:
-             logger.info("…
-
-             # …

          # Load and validate data
          success, df, message = self.load_and_validate_data()

@@ -706,261 +546,50 @@ class RobustModelTrainer:
              random_state=self.random_state
          )

-         logger.info(…

          # Train and evaluate models
-         results = self.train_and_evaluate_models(…
-
-         # Check if any models were trained successfully
-         successful_models = [name for name, result in results.items() if 'error' not in result]
-         if not successful_models:
-             return False, "❌ All model training attempts failed"

          # Select best model
-         best_model_name, best_model, best_metrics = self.select_best_model(…

-         # Save model artifacts
          if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
-             return False, "…

          # Save evaluation results
          self.save_evaluation_results(results)

          success_message = (
-             f"Model training completed successfully…
-             f"Best model: {best_model_name}…
-             f"…
-             f"Artifacts saved to: {PathConfig.MODEL_DIR}"
          )

          logger.info(success_message)
          return True, success_message

      except Exception as e:
-         error_message = f"…
          logger.error(error_message)
-         logger.error(f"📍 Full traceback: {traceback.format_exc()}")
          return False, error_message


- # =============================================================================
- # TRAINING UTILITIES AND DIAGNOSTICS
- # =============================================================================
- class TrainingDiagnostics:
-     """Diagnostic utilities for training pipeline"""
-
-     @staticmethod
-     def check_data_availability():
-         """Check if training data is available"""
-         data_path = PathConfig.COMBINED_DATASET
-
-         if not data_path.exists():
-             logger.error(f"❌ Training data not found at: {data_path}")
-
-             # Check what files are available
-             if PathConfig.DATA_DIR.exists():
-                 available_files = list(PathConfig.DATA_DIR.iterdir())
-                 logger.info(f"Available files in data directory: {[f.name for f in available_files]}")
-             else:
-                 logger.error(f"❌ Data directory doesn't exist: {PathConfig.DATA_DIR}")
-
-             return False
-
-         logger.info(f"✅ Training data found at: {data_path}")
-         return True
-
-     @staticmethod
-     def verify_model_output():
-         """Verify that model files were created correctly"""
-         files_to_check = {
-             'Pipeline': PathConfig.PIPELINE_FILE,
-             'Model': PathConfig.MODEL_FILE,
-             'Vectorizer': PathConfig.VECTORIZER_FILE,
-             'Metadata': PathConfig.METADATA_FILE
-         }
-
-         logger.info("🔍 Verifying model output files:")
-         all_exist = True
-
-         for file_type, file_path in files_to_check.items():
-             exists = file_path.exists()
-             size = file_path.stat().st_size if exists else 0
-
-             status = "✅" if exists else "❌"
-             logger.info(f"   {status} {file_type}: {file_path} ({size} bytes)")
-
-             if not exists:
-                 all_exist = False
-
-         return all_exist
-
-     @staticmethod
-     def test_model_loading():
-         """Test if the saved model can be loaded correctly"""
-         try:
-             logger.info("🧪 Testing model loading...")
-
-             # Try loading pipeline
-             if PathConfig.PIPELINE_FILE.exists():
-                 pipeline = joblib.load(PathConfig.PIPELINE_FILE)
-                 logger.info("✅ Pipeline loaded successfully")
-
-                 # Test prediction
-                 test_text = ["This is a test article for verification."]
-                 prediction = pipeline.predict(test_text)
-                 logger.info(f"✅ Test prediction successful: {prediction}")
-
-                 return True
-             else:
-                 logger.error("❌ Pipeline file not found")
-                 return False
-
-         except Exception as e:
-             logger.error(f"❌ Model loading test failed: {e}")
-             return False
-
-
- # ================================
- # ENHANCED MAIN EXECUTION FUNCTION
- # ================================
  def main():
-     """…
-
-     logger.info("🚀 Starting Enhanced Model Training Pipeline")
-     logger.info("=" * 60)
-
-     try:
-         # Step 1: Check data availability
-         logger.info("📋 Step 1: Checking data availability...")
-         if not TrainingDiagnostics.check_data_availability():
-             logger.error("❌ Training aborted: No data available")
-             print("❌ Training failed: Training data not found")
-             print(f"📁 Expected data location: {PathConfig.COMBINED_DATASET}")
-             print("💡 Please ensure the data preparation step has been completed")
-             exit(1)
-
-         # Step 2: Initialize trainer
-         logger.info("📋 Step 2: Initializing trainer...")
-         trainer = RobustModelTrainer()
-
-         # Step 3: Train model
-         logger.info("📋 Step 3: Training model...")
-         success, message = trainer.train_model()
-
-         if success:
-             # Step 4: Verify output
-             logger.info("📋 Step 4: Verifying model output...")
-             if TrainingDiagnostics.verify_model_output():
-                 logger.info("✅ All model files created successfully")
-             else:
-                 logger.warning("⚠️ Some model files may be missing")
-
-             # Step 5: Test model loading
-             logger.info("📋 Step 5: Testing model loading...")
-             if TrainingDiagnostics.test_model_loading():
-                 logger.info("✅ Model loading verification successful")
-             else:
-                 logger.warning("⚠️ Model loading verification failed")
-
-             # Success summary
-             logger.info("=" * 60)
-             logger.info("TRAINING COMPLETED SUCCESSFULLY!")
-             logger.info("=" * 60)
-             print("✅ Training completed successfully!")
-             print(f"{message}")
-             print(f"Model files saved to: {PathConfig.MODEL_DIR}")
-             print("Next steps:")
-             print("   1. Start the FastAPI server to test predictions")
-             print("   2. Run the monitoring dashboard")
-             print("   3. Perform model validation tests")
-
-         else:
-             logger.error("=" * 60)
-             logger.error("❌ TRAINING FAILED!")
-             logger.error("=" * 60)
-             print("❌ Training failed!")
-             print(f"📄 Error: {message}")
-             print("\n🔧 Troubleshooting steps:")
-             print("   1. Check if training data exists and is properly formatted")
-             print("   2. Verify sufficient disk space and memory")
-             print("   3. Review the training logs for detailed error information")
-             exit(1)
-
-     except KeyboardInterrupt:
-         logger.info("⏹️ Training interrupted by user")
-         print("\n⏹️ Training interrupted by user")
-         exit(1)
-
-     except Exception as e:
-         logger.error(f"Unexpected error during training: {str(e)}")
-         logger.error(f"Full traceback: {traceback.format_exc()}")
-         print(f"Unexpected error: {str(e)}")
-         print("Check the training logs for more details")
-         exit(1)


- # ============================
- def test_path_configuration():
-     """Test path configuration and directory creation"""
-     print("🧪 Testing path configuration...")
-
-     PathConfig.ensure_directories()
-
-     directories = [
-         PathConfig.BASE_DIR, PathConfig.DATA_DIR,
-         PathConfig.MODEL_DIR, PathConfig.LOGS_DIR, PathConfig.RESULTS_DIR
-     ]
-
-     for directory in directories:
-         if directory.exists():
-             print(f"✅ {directory}")
-         else:
-             print(f"❌ {directory}")
-
-     print("\n   Expected file locations:")
-     print(f"   Pipeline: {PathConfig.PIPELINE_FILE}")
-     print(f"   Model: {PathConfig.MODEL_FILE}")
-     print(f"   Vectorizer: {PathConfig.VECTORIZER_FILE}")
-     print(f"   Metadata: {PathConfig.METADATA_FILE}")
-
-
- def quick_data_check():
-     """Quick check of training data"""
-     print("Quick data check...")
-
-     data_path = PathConfig.COMBINED_DATASET
-     if data_path.exists():
-         try:
-             df = pd.read_csv(data_path)
-             print(f"Data loaded: {len(df)} rows, {len(df.columns)} columns")
-             print(f"Columns: {list(df.columns)}")
-             if 'label' in df.columns:
-                 print(f"Label distribution: {df['label'].value_counts().to_dict()}")
-         except Exception as e:
-             print(f"❌ Error reading data: {e}")
      else:
-         print(f"❌…


  if __name__ == "__main__":
-
-     # Handle command line arguments for testing
-     if len(sys.argv) > 1:
-         if sys.argv[1] == "test-paths":
-             test_path_configuration()
-         elif sys.argv[1] == "test-data":
-             quick_data_check()
-         elif sys.argv[1] == "test-loading":
-             TrainingDiagnostics.test_model_loading()
-         else:
-             print("Available test commands:")
-             print("   python train.py test-paths     # Test path configuration")
-             print("   python train.py test-data      # Quick data check")
-             print("   python train.py test-loading   # Test model loading")
-     else:
-         # Run main training
-         main()

  import warnings
  warnings.filterwarnings('ignore')

+ # Scikit-learn imports

+ # Configure logging
  logging.basicConfig(
      level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
      handlers=[
+         logging.FileHandler('/tmp/model_training.log'),
          logging.StreamHandler()
      ]
  )
  logger = logging.getLogger(__name__)


  class RobustModelTrainer:
+     """Production-ready model trainer with comprehensive evaluation and validation"""

      def __init__(self):
+         self.setup_paths()
          self.setup_training_config()
          self.setup_models()
+
+     def setup_paths(self):
+         """Setup all necessary paths"""
+         self.base_dir = Path("/tmp")
+         self.data_dir = self.base_dir / "data"
+         self.model_dir = self.base_dir / "model"
+         self.results_dir = self.base_dir / "results"
+
+         # Create directories
+         for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
+             dir_path.mkdir(parents=True, exist_ok=True)
+
+         # File paths
+         self.data_path = self.data_dir / "combined_dataset.csv"
+         self.model_path = self.model_dir / "model.pkl"
+         self.vectorizer_path = self.model_dir / "vectorizer.pkl"
+         self.pipeline_path = self.model_dir / "pipeline.pkl"
+         self.metadata_path = Path("/tmp/metadata.json")
+         self.evaluation_path = self.results_dir / "evaluation_results.json"

      def setup_training_config(self):
          """Setup training configuration"""
  …
          }

      def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
+         """Load and validate training data"""
          try:
+             logger.info("Loading training data...")

+             if not self.data_path.exists():
+                 return False, None, f"Data file not found: {self.data_path}"

              # Load data
+             df = pd.read_csv(self.data_path)
+
+             # Basic validation
+             if df.empty:
+                 return False, None, "Dataset is empty"
+
+             required_columns = ['text', 'label']
+             missing_columns = [
+                 col for col in required_columns if col not in df.columns]
+             if missing_columns:
+                 return False, None, f"Missing required columns: {missing_columns}"
+
              # Remove missing values
+             initial_count = len(df)
+             df = df.dropna(subset=required_columns)
              if len(df) < initial_count:
+                 logger.warning(
+                     f"Removed {initial_count - len(df)} rows with missing values")
+
+             # Validate text content
+             df = df[df['text'].astype(str).str.len() > 10]
+
+             # Validate labels
+             unique_labels = df['label'].unique()
+             if len(unique_labels) < 2:
+                 return False, None, f"Need at least 2 classes, found: {unique_labels}"
+
+             # Check minimum sample size
+             if len(df) < 100:
+                 return False, None, f"Insufficient samples for training: {len(df)}"

+             # Check class balance
              label_counts = df['label'].value_counts()
+             min_class_ratio = label_counts.min() / label_counts.max()
+             if min_class_ratio < 0.1:
+                 logger.warning(
+                     f"Severe class imbalance detected: {min_class_ratio:.3f}")
+
+             logger.info(
+                 f"Data validation successful: {len(df)} samples, {len(unique_labels)} classes")
              logger.info(f"Class distribution: {label_counts.to_dict()}")

+             return True, df, "Data loaded successfully"

          except Exception as e:
              error_msg = f"Error loading data: {str(e)}"
          return False, None, error_msg

      def preprocess_text(self, text):
+         """Advanced text preprocessing"""
          import re

+         # Convert to string
+         text = str(text)

+         # Remove URLs
+         text = re.sub(r'http\S+|www\S+|https\S+', '', text)

+         # Remove email addresses
+         text = re.sub(r'\S+@\S+', '', text)

+         # Remove excessive punctuation
+         text = re.sub(r'[!]{2,}', '!', text)
+         text = re.sub(r'[?]{2,}', '?', text)
+         text = re.sub(r'[.]{3,}', '...', text)

+         # Remove non-alphabetic characters except spaces and basic punctuation
+         text = re.sub(r'[^a-zA-Z\s.!?]', '', text)

+         # Remove excessive whitespace
+         text = re.sub(r'\s+', ' ', text)

+         return text.strip().lower()
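The restored preprocess_text is a chain of regex substitutions. A quick standalone run of the same chain (copied from the added lines above) on a sample string:

# Standalone run of the regex chain from preprocess_text above.
import re

text = "BREAKING!!! Read more at https://example.com or mail tips@example.com ..."
text = re.sub(r'http\S+|www\S+|https\S+', '', text)   # strip URLs
text = re.sub(r'\S+@\S+', '', text)                   # strip e-mail addresses
text = re.sub(r'[!]{2,}', '!', text)                  # collapse !!! -> !
text = re.sub(r'[?]{2,}', '?', text)
text = re.sub(r'[.]{3,}', '...', text)
text = re.sub(r'[^a-zA-Z\s.!?]', '', text)            # keep letters + . ! ?
text = re.sub(r'\s+', ' ', text)                      # collapse whitespace
print(text.strip().lower())  # -> "breaking! read more at or mail ..."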
      def create_preprocessing_pipeline(self) -> Pipeline:
+         """Create advanced preprocessing pipeline"""
          # Text preprocessing
          text_preprocessor = FunctionTransformer(
              func=lambda x: [self.preprocess_text(text) for text in x],
  …
          ('model', None)  # Will be set during training
          ])

+         # After creating the pipeline
+         joblib.dump(pipeline, "/tmp/pipeline.pkl")  # Save complete pipeline
+         # Individual model
+         joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
+         # Individual vectorizer
+         joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")

+         return pipeline
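The diff only shows the two ends of the pipeline definition. A minimal runnable sketch of the same shape, assuming the unshown middle steps: a cleaning FunctionTransformer, a TF-IDF step named 'vectorize', and a 'model' slot filled in later via set_params (the fallback branch of hyperparameter_tuning below sets the model the same way):

# Minimal sketch of the pipeline shape implied by the diff. The exact steps
# between the two shown ends of the real pipeline are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

clean = FunctionTransformer(lambda xs: [str(x).lower() for x in xs])
pipeline = Pipeline([
    ('preprocess', clean),
    ('vectorize', TfidfVectorizer(max_features=5000)),
    ('model', None),  # filled in during training
])
pipeline.set_params(model=LogisticRegression(max_iter=1000))
pipeline.fit(["good news story", "fake clickbait junk"], [0, 1])
print(pipeline.predict(["another news story"]))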

      def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
          """Comprehensive model evaluation with multiple metrics"""
          logger.info("Starting comprehensive model evaluation...")

+         # Predictions
+         y_pred = model.predict(X_test)
+         y_pred_proba = model.predict_proba(X_test)[:, 1]
+
+         # Basic metrics
+         metrics = {
+             'accuracy': float(accuracy_score(y_test, y_pred)),
+             'precision': float(precision_score(y_test, y_pred, average='weighted')),
+             'recall': float(recall_score(y_test, y_pred, average='weighted')),
+             'f1': float(f1_score(y_test, y_pred, average='weighted')),
+             'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
+         }
+
+         # Confusion matrix
+         cm = confusion_matrix(y_test, y_pred)
+         metrics['confusion_matrix'] = cm.tolist()
+
+         # Classification report
+         class_report = classification_report(y_test, y_pred, output_dict=True)
+         metrics['classification_report'] = class_report

+         # Cross-validation scores if training data provided
+         if X_train is not None and y_train is not None:
              try:
+                 cv_scores = cross_val_score(
+                     model, X_train, y_train,
+                     cv=StratifiedKFold(
+                         n_splits=self.cv_folds, shuffle=True, random_state=self.random_state),
+                     scoring='f1_weighted'
+                 )
+                 metrics['cv_scores'] = {
+                     'mean': float(cv_scores.mean()),
+                     'std': float(cv_scores.std()),
+                     'scores': cv_scores.tolist()
+                 }
              except Exception as e:
+                 logger.warning(f"Cross-validation failed: {e}")
+                 metrics['cv_scores'] = None

+         # Feature importance (if available)
+         try:
+             if hasattr(model, 'feature_importances_'):
+                 feature_importance = model.feature_importances_
+                 metrics['feature_importance_stats'] = {
+                     'mean': float(feature_importance.mean()),
+                     'std': float(feature_importance.std()),
+                     'top_features': feature_importance.argsort()[-10:][::-1].tolist()
+                 }
+             elif hasattr(model, 'coef_'):
+                 coefficients = model.coef_[0]
+                 metrics['coefficient_stats'] = {
+                     'mean': float(coefficients.mean()),
+                     'std': float(coefficients.std()),
+                     'top_positive': coefficients.argsort()[-10:][::-1].tolist(),
+                     'top_negative': coefficients.argsort()[:10].tolist()
+                 }
+         except Exception as e:
+             logger.warning(f"Feature importance extraction failed: {e}")

+         # Model complexity metrics
+         try:
              # Training accuracy for overfitting detection
              if X_train is not None and y_train is not None:
+                 y_train_pred = model.predict(X_train)
+                 train_accuracy = accuracy_score(y_train, y_train_pred)
+                 metrics['train_accuracy'] = float(train_accuracy)
+                 metrics['overfitting_score'] = float(
+                     train_accuracy - metrics['accuracy'])
          except Exception as e:
+             logger.warning(f"Overfitting detection failed: {e}")
+
+         return metrics
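The restored evaluation computes weighted accuracy, precision, recall, and F1, plus ROC AUC from predict_proba. A compact end-to-end run of those same sklearn calls on synthetic data:

# Compact run of the same sklearn metric calls on synthetic data.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

y_pred = model.predict(X_te)
y_proba = model.predict_proba(X_te)[:, 1]  # probability of the positive class
print({
    'accuracy': accuracy_score(y_te, y_pred),
    'precision': precision_score(y_te, y_pred, average='weighted'),
    'recall': recall_score(y_te, y_pred, average='weighted'),
    'f1': f1_score(y_te, y_pred, average='weighted'),
    'roc_auc': roc_auc_score(y_te, y_proba),
})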

      def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
          """Perform hyperparameter tuning with cross-validation"""
+         logger.info(f"Starting hyperparameter tuning for {model_name}...")

          try:
              # Set the model in the pipeline
  …
          grid_search = GridSearchCV(
              pipeline,
              param_grid,
+             cv=StratifiedKFold(n_splits=self.cv_folds,
+                                shuffle=True, random_state=self.random_state),
              scoring='f1_weighted',
              n_jobs=-1,
              verbose=1
  …
          'cv_results': {
              'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
              'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
+             'params': grid_search.cv_results_['params']
          }
      }
  …
          return grid_search.best_estimator_, tuning_results

      except Exception as e:
+         logger.error(
+             f"Hyperparameter tuning failed for {model_name}: {str(e)}")
          # Return basic model if tuning fails
+         pipeline.set_params(model=self.models[model_name]['model'])
+         pipeline.fit(X_train, y_train)
+         return pipeline, {'error': str(e)}
      def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
          """Train and evaluate multiple models"""
+         logger.info("Starting model training and evaluation...")

          results = {}
  …
          logger.info(f"Training {model_name}...")

          try:
+             # Create pipeline
              pipeline = self.create_preprocessing_pipeline()

              # Hyperparameter tuning
  …
              'training_time': datetime.now().isoformat()
          }

+         logger.info(f"Model {model_name} - F1: {evaluation_metrics['f1']:.4f}, "
              f"Accuracy: {evaluation_metrics['accuracy']:.4f}")

      except Exception as e:
+         logger.error(f"Training failed for {model_name}: {str(e)}")
          results[model_name] = {'error': str(e)}

      return results

      def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
          """Select the best performing model"""
+         logger.info("Selecting best model...")

          best_model_name = None
          best_model = None
  …
      for model_name, result in results.items():
          if 'error' in result:
              continue

          # Use F1 score as primary metric
  …
          best_metrics = result['evaluation_metrics']

      if best_model_name is None:
+         raise ValueError("No models trained successfully")

+     logger.info(
+         f"Best model: {best_model_name} with F1 score: {best_score:.4f}")
      return best_model_name, best_model, best_metrics

+     def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
+         """Save model artifacts and metadata"""
+         try:
+             logger.info("Saving model artifacts...")
+
+             # Save the full pipeline
+             joblib.dump(model, self.pipeline_path)
+
+             # Save individual components for backward compatibility
+             joblib.dump(model.named_steps['model'], self.model_path)
+             joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
+
+             # Generate data hash
+             data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
+
+             # Create metadata
+             metadata = {
+                 'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+                 'model_type': model_name,
+                 'data_version': data_hash,
+                 'train_size': metrics.get('train_accuracy', 'Unknown'),
+                 'test_size': len(metrics.get('confusion_matrix', [[0]])[0]) if 'confusion_matrix' in metrics else 'Unknown',
+                 'test_accuracy': metrics['accuracy'],
+                 'test_f1': metrics['f1'],
+                 'test_precision': metrics['precision'],
+                 'test_recall': metrics['recall'],
+                 'test_roc_auc': metrics['roc_auc'],
+                 'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
+                 'cv_score_mean': metrics.get('cv_scores', {}).get('mean', 'Unknown'),
+                 'cv_score_std': metrics.get('cv_scores', {}).get('std', 'Unknown'),
+                 'timestamp': datetime.now().isoformat(),
+                 'training_config': {
+                     'test_size': self.test_size,
+                     'validation_size': self.validation_size,
+                     'cv_folds': self.cv_folds,
+                     'max_features': self.max_features,
+                     'ngram_range': self.ngram_range,
+                     'feature_selection_k': self.feature_selection_k
+                 }
+             }
+
+             # Save metadata
+             with open(self.metadata_path, 'w') as f:
+                 json.dump(metadata, f, indent=2)
+
+             logger.info(f"Model artifacts saved successfully")
+             logger.info(f"Model path: {self.model_path}")
+             logger.info(f"Vectorizer path: {self.vectorizer_path}")
+             logger.info(f"Pipeline path: {self.pipeline_path}")
+             logger.info(f"Metadata path: {self.metadata_path}")
+
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to save model artifacts: {str(e)}")
+             return False
+
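The metadata file at /tmp/metadata.json is plain JSON, so downstream components can read the model version and test metrics without unpickling anything. A small reader sketch, assuming training has already run:

# Small reader sketch for the metadata JSON written above.
import json
from pathlib import Path

metadata_path = Path("/tmp/metadata.json")
if metadata_path.exists():
    with open(metadata_path) as f:
        metadata = json.load(f)
    print(metadata["model_version"], metadata["model_type"])
    print("test F1:", metadata["test_f1"])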
      def save_evaluation_results(self, results: Dict) -> bool:
          """Save comprehensive evaluation results"""
          try:
  …
              clean_results[model_name] = {
                  'tuning_results': {
                      k: v for k, v in result['tuning_results'].items()
+                     if k != 'best_estimator'
                  },
                  'evaluation_metrics': result['evaluation_metrics'],
                  'training_time': result['training_time']
              }

+             # Save results
+             with open(self.evaluation_path, 'w') as f:
                  json.dump(clean_results, f, indent=2, default=str)

+             logger.info(f"Evaluation results saved to {self.evaluation_path}")
              return True

          except Exception as e:
+             logger.error(f"Failed to save evaluation results: {str(e)}")
              return False

      def train_model(self, data_path: str = None) -> Tuple[bool, str]:
          """Main training function with comprehensive pipeline"""
          try:
+             logger.info("Starting model training pipeline...")
+
+             # Override data path if provided
+             if data_path:
+                 self.data_path = Path(data_path)

              # Load and validate data
              success, df, message = self.load_and_validate_data()
  …
                  random_state=self.random_state
              )

+             logger.info(
+                 f"Data split: {len(X_train)} train, {len(X_test)} test")

              # Train and evaluate models
+             results = self.train_and_evaluate_models(
+                 X_train, X_test, y_train, y_test)

              # Select best model
+             best_model_name, best_model, best_metrics = self.select_best_model(
+                 results)

+             # Save model artifacts
              if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
+                 return False, "Failed to save model artifacts"

              # Save evaluation results
              self.save_evaluation_results(results)

              success_message = (
+                 f"Model training completed successfully. "
+                 f"Best model: {best_model_name} "
+                 f"(F1: {best_metrics['f1']:.4f}, Accuracy: {best_metrics['accuracy']:.4f})"
              )

              logger.info(success_message)
              return True, success_message

          except Exception as e:
+             error_message = f"Model training failed: {str(e)}"
              logger.error(error_message)
              return False, error_message


  def main():
+     """Main execution function"""
+     trainer = RobustModelTrainer()
+     success, message = trainer.train_model()

+     if success:
+         print(f"✅ {message}")
      else:
+         print(f"❌ {message}")
+         exit(1)


  if __name__ == "__main__":
+     main()
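Once training finishes, the saved pipeline at /tmp/model/pipeline.pkl can be smoke-tested on its own, much like the TrainingDiagnostics.test_model_loading helper this commit removes. A minimal sketch:

# Minimal smoke test for the saved artifact, echoing the removed
# TrainingDiagnostics.test_model_loading helper.
from pathlib import Path
import joblib

pipeline_path = Path("/tmp/model/pipeline.pkl")
if pipeline_path.exists():
    pipeline = joblib.load(pipeline_path)
    print(pipeline.predict(["This is a test article for verification."]))
else:
    print(f"No pipeline at {pipeline_path}; run train.py first")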