Spaces:

Ahmedik95316
/

Fake-News-Detection-with-MLOps

Running

Ahmedik95316 committed on Aug 18

Commit

ca89c11

1 Parent(s): cc8f5e2

Update model/train.py

Key Fixes Applied to train.py:

1. FIXED PATH MANAGEMENT (Critical Bug Fix):
- Removed hardcoded paths like `"/tmp/pipeline.pkl"`
- Added centralized `PathConfig` class that matches `fastapi_server.py`
- Fixed save paths in `save_model_artifacts()` (the old hardcoded dumps were in `create_preprocessing_pipeline()`):
  - Pipeline: `/tmp/model/pipeline.pkl` (was `/tmp/pipeline.pkl`)
  - Model: `/tmp/model/model.pkl` (was `/tmp/model.pkl`)
  - Vectorizer: `/tmp/model/vectorizer.pkl` (was `/tmp/vectorizer.pkl`)

2. Enhanced Error Handling:
- Added comprehensive data validation with `DataValidator` class
- Better exception handling throughout the training pipeline
- Graceful fallbacks when components fail

3. Added Diagnostics & Testing:
- `TrainingDiagnostics` class for verifying training output
- Path verification functions to debug issues
- Model loading tests to ensure artifacts work correctly
- Command-line testing options (`python train.py test-paths`)

4. Improved Robustness:
- Directory auto-creation with proper permissions
- Enhanced metadata generation with comprehensive model info
- Better logging with status indicators (✅❌⚠️)

5. Path Consistency Verification:
- Logs all paths during training for verification
- File existence checks after saving
- Size verification to ensure files aren't empty

The key problem was that:
- Before: `train.py` saved to `/tmp/pipeline.pkl` but `fastapi_server.py` looked in `/tmp/model/`
- After: Both use the same `PathConfig` and save/load from `/tmp/model/`

Files changed (1) hide show

model/train.py +711 -323

model/train.py CHANGED Viewed

@@ -1,3 +1,20 @@
 import pandas as pd
 import numpy as np
 from pathlib import Path
@@ -10,63 +27,162 @@ from typing import Dict, Tuple, Optional, Any
 import warnings
 warnings.filterwarnings('ignore')
-# Scikit-learn imports
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import (
-    train_test_split, cross_val_score, GridSearchCV,
-    StratifiedKFold, validation_curve
-)
-from sklearn.metrics import (
-    accuracy_score, precision_score, recall_score, f1_score,
-    roc_auc_score, confusion_matrix, classification_report,
-    precision_recall_curve, roc_curve
-)
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import FunctionTransformer
-from sklearn.feature_selection import SelectKBest, chi2
-import matplotlib.pyplot as plt
-import seaborn as sns
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s',
     handlers=[
-        logging.FileHandler('/tmp/model_training.log'),
         logging.StreamHandler()
     ]
 )
 logger = logging.getLogger(__name__)
-class RobustModelTrainer:
-    """Production-ready model trainer with comprehensive evaluation and validation"""
     def __init__(self):
-        self.setup_paths()
         self.setup_training_config()
         self.setup_models()
-    def setup_paths(self):
-        """Setup all necessary paths"""
-        self.base_dir = Path("/tmp")
-        self.data_dir = self.base_dir / "data"
-        self.model_dir = self.base_dir / "model"
-        self.results_dir = self.base_dir / "results"
-        # Create directories
-        for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
-            dir_path.mkdir(parents=True, exist_ok=True)
-        # File paths
-        self.data_path = self.data_dir / "combined_dataset.csv"
-        self.model_path = self.model_dir / "model.pkl"
-        self.vectorizer_path = self.model_dir / "vectorizer.pkl"
-        self.pipeline_path = self.model_dir / "pipeline.pkl"
-        self.metadata_path = Path("/tmp/metadata.json")
-        self.evaluation_path = self.results_dir / "evaluation_results.json"
     def setup_training_config(self):
         """Setup training configuration"""
         self.test_size = 0.2
@@ -80,7 +196,7 @@ class RobustModelTrainer:
         self.max_iter = 1000
         self.class_weight = 'balanced'
         self.feature_selection_k = 5000
     def setup_models(self):
         """Setup model configurations for comparison"""
         self.models = {
@@ -109,95 +225,96 @@ class RobustModelTrainer:
                 }
             }
         }
     def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
-        """Load and validate training data"""
         try:
-            logger.info("Loading training data...")
-            if not self.data_path.exists():
-                return False, None, f"Data file not found: {self.data_path}"
             # Load data
-            df = pd.read_csv(self.data_path)
-            # Basic validation
-            if df.empty:
-                return False, None, "Dataset is empty"
-            required_columns = ['text', 'label']
-            missing_columns = [col for col in required_columns if col not in df.columns]
-            if missing_columns:
-                return False, None, f"Missing required columns: {missing_columns}"
-            # Remove missing values
             initial_count = len(df)
-            df = df.dropna(subset=required_columns)
-            if len(df) < initial_count:
-                logger.warning(f"Removed {initial_count - len(df)} rows with missing values")
-            # Validate text content
-            df = df[df['text'].astype(str).str.len() > 10]
-            # Validate labels
-            unique_labels = df['label'].unique()
-            if len(unique_labels) < 2:
-                return False, None, f"Need at least 2 classes, found: {unique_labels}"
-            # Check minimum sample size
-            if len(df) < 100:
-                return False, None, f"Insufficient samples for training: {len(df)}"
-            # Check class balance
             label_counts = df['label'].value_counts()
-            min_class_ratio = label_counts.min() / label_counts.max()
-            if min_class_ratio < 0.1:
-                logger.warning(f"Severe class imbalance detected: {min_class_ratio:.3f}")
-            logger.info(f"Data validation successful: {len(df)} samples, {len(unique_labels)} classes")
             logger.info(f"Class distribution: {label_counts.to_dict()}")
-            return True, df, "Data loaded successfully"
         except Exception as e:
             error_msg = f"Error loading data: {str(e)}"
             logger.error(error_msg)
             return False, None, error_msg
     def preprocess_text(self, text):
-        """Advanced text preprocessing"""
         import re
-        # Convert to string
-        text = str(text)
-        # Remove URLs
-        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-        # Remove email addresses
-        text = re.sub(r'\S+@\S+', '', text)
-        # Remove excessive punctuation
-        text = re.sub(r'[!]{2,}', '!', text)
-        text = re.sub(r'[?]{2,}', '?', text)
-        text = re.sub(r'[.]{3,}', '...', text)
-        # Remove non-alphabetic characters except spaces and basic punctuation
-        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
-        # Remove excessive whitespace
-        text = re.sub(r'\s+', ' ', text)
-        return text.strip().lower()
     def create_preprocessing_pipeline(self) -> Pipeline:
-        """Create advanced preprocessing pipeline"""
         # Text preprocessing
         text_preprocessor = FunctionTransformer(
             func=lambda x: [self.preprocess_text(text) for text in x],
             validate=False
         )
         # TF-IDF vectorization
         vectorizer = TfidfVectorizer(
             max_features=self.max_features,
@@ -208,13 +325,13 @@ class RobustModelTrainer:
             sublinear_tf=True,
             norm='l2'
         )
         # Feature selection
         feature_selector = SelectKBest(
             score_func=chi2,
             k=self.feature_selection_k
         )
         # Create pipeline
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
@@ -223,99 +340,205 @@ class RobustModelTrainer:
             ('model', None)  # Will be set during training
         ])
-        # Save the pipeline to .pkl format
-        joblib.dump(pipeline, "/tmp/pipeline.pkl")  # Save complete pipeline
-        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")  # Individual model
-        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")  # Individual vectorizer
         return pipeline
-    def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
-        """Comprehensive model evaluation with multiple metrics"""
-        logger.info("Starting comprehensive model evaluation...")
-        # Predictions
-        y_pred = model.predict(X_test)
-        y_pred_proba = model.predict_proba(X_test)[:, 1]
-        # Basic metrics
-        metrics = {
-            'accuracy': float(accuracy_score(y_test, y_pred)),
-            'precision': float(precision_score(y_test, y_pred, average='weighted')),
-            'recall': float(recall_score(y_test, y_pred, average='weighted')),
-            'f1': float(f1_score(y_test, y_pred, average='weighted')),
-            'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
-        }
-        # Confusion matrix
-        cm = confusion_matrix(y_test, y_pred)
-        metrics['confusion_matrix'] = cm.tolist()
-        # Classification report
-        class_report = classification_report(y_test, y_pred, output_dict=True)
-        metrics['classification_report'] = class_report
-        # Cross-validation scores if training data provided
-        if X_train is not None and y_train is not None:
             try:
-                cv_scores = cross_val_score(
-                    model, X_train, y_train,
-                    cv=StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state),
-                    scoring='f1_weighted'
-                )
-                metrics['cv_scores'] = {
-                    'mean': float(cv_scores.mean()),
-                    'std': float(cv_scores.std()),
-                    'scores': cv_scores.tolist()
-                }
             except Exception as e:
-                logger.warning(f"Cross-validation failed: {e}")
-                metrics['cv_scores'] = None
-        # Feature importance (if available)
-        try:
-            if hasattr(model, 'feature_importances_'):
-                feature_importance = model.feature_importances_
-                metrics['feature_importance_stats'] = {
-                    'mean': float(feature_importance.mean()),
-                    'std': float(feature_importance.std()),
-                    'top_features': feature_importance.argsort()[-10:][::-1].tolist()
-                }
-            elif hasattr(model, 'coef_'):
-                coefficients = model.coef_[0]
-                metrics['coefficient_stats'] = {
-                    'mean': float(coefficients.mean()),
-                    'std': float(coefficients.std()),
-                    'top_positive': coefficients.argsort()[-10:][::-1].tolist(),
-                    'top_negative': coefficients.argsort()[:10].tolist()
-                }
         except Exception as e:
-            logger.warning(f"Feature importance extraction failed: {e}")
-        # Model complexity metrics
         try:
             # Training accuracy for overfitting detection
             if X_train is not None and y_train is not None:
-                y_train_pred = model.predict(X_train)
-                train_accuracy = accuracy_score(y_train, y_train_pred)
-                metrics['train_accuracy'] = float(train_accuracy)
-                metrics['overfitting_score'] = float(train_accuracy - metrics['accuracy'])
         except Exception as e:
-            logger.warning(f"Overfitting detection failed: {e}")
-        return metrics
     def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
         """Perform hyperparameter tuning with cross-validation"""
-        logger.info(f"Starting hyperparameter tuning for {model_name}...")
         try:
             # Set the model in the pipeline
             pipeline.set_params(model=self.models[model_name]['model'])
             # Get parameter grid
             param_grid = self.models[model_name]['param_grid']
             # Create GridSearchCV
             grid_search = GridSearchCV(
                 pipeline,
@@ -325,10 +548,10 @@ class RobustModelTrainer:
                 n_jobs=-1,
                 verbose=1
             )
             # Fit grid search
             grid_search.fit(X_train, y_train)
             # Extract results
             tuning_results = {
                 'best_params': grid_search.best_params_,
@@ -337,46 +560,50 @@ class RobustModelTrainer:
                 'cv_results': {
                     'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
                     'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
-                    'params': grid_search.cv_results_['params']
                 }
             }
             logger.info(f"Hyperparameter tuning completed for {model_name}")
             logger.info(f"Best score: {grid_search.best_score_:.4f}")
             logger.info(f"Best params: {grid_search.best_params_}")
             return grid_search.best_estimator_, tuning_results
         except Exception as e:
-            logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}")
             # Return basic model if tuning fails
-            pipeline.set_params(model=self.models[model_name]['model'])
-            pipeline.fit(X_train, y_train)
-            return pipeline, {'error': str(e)}
     def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
         """Train and evaluate multiple models"""
-        logger.info("Starting model training and evaluation...")
         results = {}
         for model_name in self.models.keys():
             logger.info(f"Training {model_name}...")
             try:
-                # Create pipeline
                 pipeline = self.create_preprocessing_pipeline()
                 # Hyperparameter tuning
                 best_model, tuning_results = self.hyperparameter_tuning(
                     pipeline, X_train, y_train, model_name
                 )
                 # Comprehensive evaluation
                 evaluation_metrics = self.comprehensive_evaluation(
                     best_model, X_test, y_test, X_train, y_train
                 )
                 # Store results
                 results[model_name] = {
                     'model': best_model,
@@ -384,101 +611,45 @@ class RobustModelTrainer:
                     'evaluation_metrics': evaluation_metrics,
                     'training_time': datetime.now().isoformat()
                 }
-                logger.info(f"Model {model_name} - F1: {evaluation_metrics['f1']:.4f}, "
-                           f"Accuracy: {evaluation_metrics['accuracy']:.4f}")
             except Exception as e:
-                logger.error(f"Training failed for {model_name}: {str(e)}")
                 results[model_name] = {'error': str(e)}
         return results
     def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
         """Select the best performing model"""
-        logger.info("Selecting best model...")
         best_model_name = None
         best_model = None
         best_score = -1
         best_metrics = None
         for model_name, result in results.items():
             if 'error' in result:
                 continue
             # Use F1 score as primary metric
             f1_score = result['evaluation_metrics']['f1']
             if f1_score > best_score:
                 best_score = f1_score
                 best_model_name = model_name
                 best_model = result['model']
                 best_metrics = result['evaluation_metrics']
         if best_model_name is None:
-            raise ValueError("No models trained successfully")
-        logger.info(f"Best model: {best_model_name} with F1 score: {best_score:.4f}")
         return best_model_name, best_model, best_metrics
-    def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
-        """Save model artifacts and metadata"""
-        try:
-            logger.info("Saving model artifacts...")
-            # Save the full pipeline
-            joblib.dump(model, self.pipeline_path)
-            # Save individual components for backward compatibility
-            joblib.dump(model.named_steps['model'], self.model_path)
-            joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
-            # Generate data hash
-            data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
-            # Create metadata
-            metadata = {
-                'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-                'model_type': model_name,
-                'data_version': data_hash,
-                'train_size': metrics.get('train_accuracy', 'Unknown'),
-                'test_size': len(metrics.get('confusion_matrix', [[0]])[0]) if 'confusion_matrix' in metrics else 'Unknown',
-                'test_accuracy': metrics['accuracy'],
-                'test_f1': metrics['f1'],
-                'test_precision': metrics['precision'],
-                'test_recall': metrics['recall'],
-                'test_roc_auc': metrics['roc_auc'],
-                'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
-                'cv_score_mean': metrics.get('cv_scores', {}).get('mean', 'Unknown'),
-                'cv_score_std': metrics.get('cv_scores', {}).get('std', 'Unknown'),
-                'timestamp': datetime.now().isoformat(),
-                'training_config': {
-                    'test_size': self.test_size,
-                    'validation_size': self.validation_size,
-                    'cv_folds': self.cv_folds,
-                    'max_features': self.max_features,
-                    'ngram_range': self.ngram_range,
-                    'feature_selection_k': self.feature_selection_k
-                }
-            }
-            # Save metadata
-            with open(self.metadata_path, 'w') as f:
-                json.dump(metadata, f, indent=2)
-            logger.info(f"Model artifacts saved successfully")
-            logger.info(f"Model path: {self.model_path}")
-            logger.info(f"Vectorizer path: {self.vectorizer_path}")
-            logger.info(f"Pipeline path: {self.pipeline_path}")
-            logger.info(f"Metadata path: {self.metadata_path}")
-            return True
-        except Exception as e:
-            logger.error(f"Failed to save model artifacts: {str(e)}")
-            return False
     def save_evaluation_results(self, results: Dict) -> bool:
         """Save comprehensive evaluation results"""
         try:
@@ -490,89 +661,306 @@ class RobustModelTrainer:
                 else:
                     clean_results[model_name] = {
                         'tuning_results': {
-                            k: v for k, v in result['tuning_results'].items()
-                            if k != 'best_estimator'
                         },
                         'evaluation_metrics': result['evaluation_metrics'],
                         'training_time': result['training_time']
                     }
-            # Save results
-            with open(self.evaluation_path, 'w') as f:
                 json.dump(clean_results, f, indent=2, default=str)
-            logger.info(f"Evaluation results saved to {self.evaluation_path}")
             return True
         except Exception as e:
-            logger.error(f"Failed to save evaluation results: {str(e)}")
             return False
     def train_model(self, data_path: str = None) -> Tuple[bool, str]:
         """Main training function with comprehensive pipeline"""
         try:
-            logger.info("Starting model training pipeline...")
-            # Override data path if provided
-            if data_path:
-                self.data_path = Path(data_path)
             # Load and validate data
             success, df, message = self.load_and_validate_data()
             if not success:
                 return False, message
             # Prepare data
             X = df['text'].values
             y = df['label'].values
             # Train-test split
             X_train, X_test, y_train, y_test = train_test_split(
-                X, y,
                 test_size=self.test_size,
                 stratify=y,
                 random_state=self.random_state
             )
             logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
             # Train and evaluate models
             results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
             # Select best model
             best_model_name, best_model, best_metrics = self.select_best_model(results)
-            # Save model artifacts
             if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
-                return False, "Failed to save model artifacts"
             # Save evaluation results
             self.save_evaluation_results(results)
             success_message = (
-                f"Model training completed successfully. "
-                f"Best model: {best_model_name} "
-                f"(F1: {best_metrics['f1']:.4f}, Accuracy: {best_metrics['accuracy']:.4f})"
             )
             logger.info(success_message)
             return True, success_message
         except Exception as e:
-            error_message = f"Model training failed: {str(e)}"
             logger.error(error_message)
             return False, error_message
 def main():
-    """Main execution function"""
-    trainer = RobustModelTrainer()
-    success, message = trainer.train_model()
-    if success:
-        print(f"✅ {message}")
-    else:
-        print(f"❌ {message}")
         exit(1)
 if __name__ == "__main__":
-    main()

+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.feature_selection import SelectKBest, chi2
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import (
+    accuracy_score, precision_score, recall_score, f1_score,
+    roc_auc_score, confusion_matrix, classification_report,
+    precision_recall_curve, roc_curve
+)
+from sklearn.model_selection import (
+    train_test_split, cross_val_score, GridSearchCV,
+    StratifiedKFold, validation_curve
+)
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
 import numpy as np
 from pathlib import Path
 import warnings
 warnings.filterwarnings('ignore')
+# =============================================================================
+# CENTRALIZED PATH CONFIGURATION - MATCHES FASTAPI SERVER
+# =============================================================================
+class PathConfig:
+    """Centralized path management to ensure consistency across all components"""
+    # Base directories: every path below is rooted at /tmp
+    BASE_DIR = Path("/tmp")
+    DATA_DIR = BASE_DIR / "data"
+    MODEL_DIR = BASE_DIR / "model"  # /tmp/model/ - parent of the three .pkl artifacts below
+    LOGS_DIR = BASE_DIR / "logs"
+    RESULTS_DIR = BASE_DIR / "results"
+    # Model files. NOTE(review): stated to match fastapi_server.py, which is not visible here - confirm
+    MODEL_FILE = MODEL_DIR / "model.pkl"           # /tmp/model/model.pkl
+    VECTORIZER_FILE = MODEL_DIR / "vectorizer.pkl" # /tmp/model/vectorizer.pkl
+    PIPELINE_FILE = MODEL_DIR / "pipeline.pkl"     # /tmp/model/pipeline.pkl
+    METADATA_FILE = BASE_DIR / "metadata.json"     # /tmp/metadata.json - directly under BASE_DIR, not MODEL_DIR
+    # Data files (all under DATA_DIR)
+    COMBINED_DATASET = DATA_DIR / "combined_dataset.csv"
+    SCRAPED_DATA = DATA_DIR / "scraped_real.csv"
+    GENERATED_DATA = DATA_DIR / "generated_fake.csv"
+    # Log and result files
+    TRAINING_LOG = LOGS_DIR / "model_training.log"
+    EVALUATION_RESULTS = RESULTS_DIR / "evaluation_results.json"
+    @classmethod
+    def ensure_directories(cls):
+        """Create all required directories with proper permissions"""
+        for attr_name in dir(cls):  # walk every attribute name defined on the class
+            attr = getattr(cls, attr_name)
+            if isinstance(attr, Path) and attr_name.endswith('_DIR'):  # keep only the *_DIR Path constants
+                attr.mkdir(parents=True, exist_ok=True, mode=0o755)  # NOTE(review): mode is subject to the process umask, so 0o755 is an upper bound
+        # Explicit second pass over the same five *_DIR constants the loop above already handled
+        for directory in [cls.BASE_DIR, cls.DATA_DIR, cls.MODEL_DIR, cls.LOGS_DIR, cls.RESULTS_DIR]:
+            directory.mkdir(parents=True, exist_ok=True, mode=0o755)  # exist_ok=True: no error when the directory is already there
+# Initialize directories at startup
+PathConfig.ensure_directories()
+# =============================================================================
+# ENHANCED LOGGING CONFIGURATION
+# =============================================================================
 logging.basicConfig(
     level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
     handlers=[
+        logging.FileHandler(PathConfig.TRAINING_LOG),
         logging.StreamHandler()
     ]
 )
 logger = logging.getLogger(__name__)
+# =============================================================================
+# DATA VALIDATION PIPELINE
+# =============================================================================
+class DataValidator:
+    """Comprehensive data validation for training pipeline"""
+    def __init__(self, min_text_length: int = 10, max_null_ratio: float = 0.1):
+        self.min_text_length = min_text_length  # texts with fewer characters are reported by validate_quality
+        self.max_null_ratio = max_null_ratio  # ceiling on the fraction of null cells in the whole frame
+    def validate_schema(self, df: pd.DataFrame) -> Tuple[bool, list]:
+        """Validate data schema"""
+        errors = []
+        required_columns = ['text', 'label']
+        missing_cols = set(required_columns) - set(df.columns)  # required names absent from the frame
+        if missing_cols:
+            errors.append(f"Missing required columns: {missing_cols}")
+        return len(errors) == 0, errors  # (True, []) when both columns are present
+    def validate_quality(self, df: pd.DataFrame) -> Tuple[bool, list]:
+        """Validate data quality"""
+        errors = []
+        # Null ratio: null cells divided by all cells, counted over every column (not only text/label)
+        null_ratio = df.isnull().sum().sum() / (len(df) * len(df.columns))  # NOTE(review): denominator is 0 for a frame with no rows or no columns
+        if null_ratio > self.max_null_ratio:
+            errors.append(f"Too many nulls: {null_ratio:.2%} > {self.max_null_ratio:.2%}")
+        # Text quality: skipped entirely when the 'text' column is missing
+        if 'text' in df.columns:
+            short_texts = (df['text'].astype(str).str.len() < self.min_text_length).sum()  # values are stringified before measuring; NOTE(review): missing texts presumably count as short - confirm
+            if short_texts > 0:
+                errors.append(f"{short_texts} texts below minimum length ({self.min_text_length} chars)")  # a single short text is enough to fail validation
+        # Minimum sample count: the floor of 100 rows is hard-coded, not a constructor parameter
+        if len(df) < 100:
+            errors.append(f"Insufficient samples for training: {len(df)} < 100")
+        # Class distribution: skipped entirely when the 'label' column is missing
+        if 'label' in df.columns:
+            unique_labels = df['label'].unique()
+            if len(unique_labels) < 2:
+                errors.append(f"Need at least 2 classes, found: {unique_labels}")
+            label_counts = df['label'].value_counts()
+            min_class_ratio = label_counts.min() / label_counts.max()  # rarest class count over most common class count
+            if min_class_ratio < 0.05:  # below 5%: treated as an error
+                errors.append(f"Severe class imbalance: {min_class_ratio:.3f}")
+            elif min_class_ratio < 0.1:  # between 5% and 10%: logged as a warning only, validation still passes
+                logger.warning(f"Class imbalance detected: {min_class_ratio:.3f}")
+        return len(errors) == 0, errors  # valid only when no check appended an error
+    def validate(self, df: pd.DataFrame) -> Tuple[bool, Dict[str, list]]:
+        """Complete data validation"""
+        all_valid = True
+        all_errors = {}  # maps 'schema' / 'quality' to that stage's error list; a stage that passes adds no key
+        # Schema validation
+        schema_valid, schema_errors = self.validate_schema(df)
+        if not schema_valid:
+            all_valid = False
+            all_errors['schema'] = schema_errors
+        # Quality validation runs even when the schema check failed
+        quality_valid, quality_errors = self.validate_quality(df)
+        if not quality_valid:
+            all_valid = False
+            all_errors['quality'] = quality_errors
+        return all_valid, all_errors
+# =============================================================================
+# ENHANCED MODEL TRAINER WITH FIXED PATHS
+# =============================================================================
+class RobustModelTrainer:
+    """Production-ready model trainer with comprehensive evaluation and FIXED PATH MANAGEMENT"""
     def __init__(self):
+        # Use centralized path configuration
+        PathConfig.ensure_directories()
         self.setup_training_config()
         self.setup_models()
+        self.data_validator = DataValidator()
+        # Log path configuration for verification
+        logger.info("🔧 Path Configuration:")
+        logger.info(f"Model Directory: {PathConfig.MODEL_DIR}")
+        logger.info(f"Pipeline File: {PathConfig.PIPELINE_FILE}")
+        logger.info(f"Model File: {PathConfig.MODEL_FILE}")
+        logger.info(f"Vectorizer File: {PathConfig.VECTORIZER_FILE}")
+        logger.info(f"Metadata File: {PathConfig.METADATA_FILE}")
     def setup_training_config(self):
         """Setup training configuration"""
         self.test_size = 0.2
         self.max_iter = 1000
         self.class_weight = 'balanced'
         self.feature_selection_k = 5000
     def setup_models(self):
         """Setup model configurations for comparison"""
         self.models = {
                 }
             }
         }
     def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
+        """Load and validate training data with enhanced validation"""
         try:
+            logger.info("Loading and validating training data...")
+            data_path = PathConfig.COMBINED_DATASET
+            if not data_path.exists():
+                return False, None, f"Data file not found: {data_path}"
             # Load data
+            df = pd.read_csv(data_path)
+            logger.info(f"Loaded dataset with {len(df)} samples")
+            # Enhanced validation using DataValidator
+            valid, validation_errors = self.data_validator.validate(df)
+            if not valid:
+                error_msg = "Data validation failed:\n" + "\n".join([
+                    f"  {category}: {errors}" for category, errors in validation_errors.items()
+                ])
+                logger.error(error_msg)
+                return False, None, error_msg
+            # Clean data
             initial_count = len(df)
+            # Remove missing values
+            df = df.dropna(subset=['text', 'label'])
+            # Remove short texts
+            df = df[df['text'].astype(str).str.len() >= self.data_validator.min_text_length]
+            if len(df) < initial_count:
+                logger.info(f"🧹 Cleaned data: removed {initial_count - len(df)} invalid samples")
+            # Log final statistics
             label_counts = df['label'].value_counts()
+            logger.info(f"Data validation successful: {len(df)} samples")
             logger.info(f"Class distribution: {label_counts.to_dict()}")
+            return True, df, "Data loaded and validated successfully"
         except Exception as e:
             error_msg = f"Error loading data: {str(e)}"
             logger.error(error_msg)
             return False, None, error_msg
     def preprocess_text(self, text):
+        """Advanced text preprocessing with better error handling"""
         import re
+        try:
+            # Convert to string
+            text = str(text)
+            # Remove URLs
+            text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+            # Remove email addresses
+            text = re.sub(r'\S+@\S+', '', text)
+            # Remove excessive punctuation
+            text = re.sub(r'[!]{2,}', '!', text)
+            text = re.sub(r'[?]{2,}', '?', text)
+            text = re.sub(r'[.]{3,}', '...', text)
+            # Remove non-alphabetic characters except spaces and basic punctuation
+            text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
+            # Remove excessive whitespace
+            text = re.sub(r'\s+', ' ', text)
+            return text.strip().lower()
+        except Exception as e:
+            logger.warning(f"Text preprocessing failed for text, returning original: {e}")
+            return str(text).lower()
     def create_preprocessing_pipeline(self) -> Pipeline:
+        """Build the unfitted scikit-learn pipeline used for a candidate model.
+
+        Constructs a text-cleaning ``FunctionTransformer`` (applies
+        ``preprocess_text`` to each item), a ``TfidfVectorizer`` and a
+        chi-squared ``SelectKBest`` selector, then returns a ``Pipeline`` whose
+        ``'model'`` step is a ``None`` placeholder.
+
+        Returns:
+            Pipeline: the unfitted pipeline.
+        """
+        logger.info("🔧 Creating preprocessing pipeline...")
         # Text preprocessing
+        # NOTE(review): a lambda generally cannot be pickled by the standard
+        # pickle machinery; confirm that joblib.dump of the fitted pipeline
+        # (see save_model_artifacts) succeeds with this transformer.
         text_preprocessor = FunctionTransformer(
             func=lambda x: [self.preprocess_text(text) for text in x],
             validate=False
         )
         # TF-IDF vectorization
         vectorizer = TfidfVectorizer(
             max_features=self.max_features,
             sublinear_tf=True,
             norm='l2'
         )
         # Feature selection
         feature_selector = SelectKBest(
             score_func=chi2,
             k=self.feature_selection_k
         )
         # Create pipeline
+        # NOTE(review): in the lines visible here `vectorizer` and
+        # `feature_selector` are built but never added as steps, although
+        # save_model_artifacts looks for a 'vectorize' step. Confirm against
+        # the full file that no step entries are missing from this list.
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
             ('model', None)  # Will be set during training
         ])
+        logger.info("Preprocessing pipeline created successfully")
         return pipeline
+    def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
+        """Persist the trained pipeline, its components and its metadata.
+
+        The whole ``model`` object is dumped to ``PathConfig.PIPELINE_FILE``.
+        If it exposes ``named_steps``, the ``'model'`` and ``'vectorize'``
+        steps are additionally dumped on a best-effort basis (failures there
+        are only logged as warnings). Metadata from ``generate_metadata`` is
+        written as JSON to ``PathConfig.METADATA_FILE``.
+
+        Args:
+            model: Fitted estimator or pipeline to persist.
+            model_name: Name recorded in the metadata.
+            metrics: Evaluation metrics forwarded to ``generate_metadata``.
+
+        Returns:
+            True when saving completed and the pipeline file exists; False if
+            any exception was raised (the error is logged, not propagated).
+        """
+        try:
+            logger.info("💾 Saving model artifacts with corrected paths...")
+            # FIXED: Use centralized path configuration
+            pipeline_path = PathConfig.PIPELINE_FILE     # /tmp/model/pipeline.pkl
+            model_path = PathConfig.MODEL_FILE           # /tmp/model/model.pkl
+            vectorizer_path = PathConfig.VECTORIZER_FILE # /tmp/model/vectorizer.pkl
+            metadata_path = PathConfig.METADATA_FILE     # /tmp/metadata.json
+            logger.info(f"Saving to paths:")
+            logger.info(f"  Pipeline: {pipeline_path}")
+            logger.info(f"  Model: {model_path}")
+            logger.info(f"  Vectorizer: {vectorizer_path}")
+            logger.info(f"  Metadata: {metadata_path}")
+            # Save the complete pipeline (FIXED PATH)
+            joblib.dump(model, pipeline_path)
+            logger.info("Saved complete pipeline")
+            # Save individual components for backward compatibility (FIXED PATHS)
+            # Best effort: a failure here must not abort the overall save.
             try:
+                if hasattr(model, 'named_steps'):
+                    # Save individual model
+                    if 'model' in model.named_steps and model.named_steps['model'] is not None:
+                        joblib.dump(model.named_steps['model'], model_path)
+                        logger.info("Saved individual model component")
+                    # Save individual vectorizer
+                    if 'vectorize' in model.named_steps and model.named_steps['vectorize'] is not None:
+                        joblib.dump(model.named_steps['vectorize'], vectorizer_path)
+                        logger.info("Saved individual vectorizer component")
+                else:
+                    logger.warning("Model doesn't have named_steps, skipping individual component saves")
             except Exception as e:
+                logger.warning(f"Could not save individual components: {e}")
+            # Generate comprehensive metadata
+            metadata = self.generate_metadata(model_name, metrics)
+            # Save metadata (FIXED PATH)
+            with open(metadata_path, 'w') as f:
+                json.dump(metadata, f, indent=2)
+            logger.info("Saved model metadata")
+            # Verify all files were created
+            # Only existence is checked here; file sizes are not inspected.
+            verification_results = {
+                'pipeline': pipeline_path.exists(),
+                'model': model_path.exists(),
+                'vectorizer': vectorizer_path.exists(),
+                'metadata': metadata_path.exists()
+            }
+            logger.info("🔍 File verification results:")
+            for file_type, exists in verification_results.items():
+                status = "✅" if exists else "❌"
+                logger.info(f"  {status} {file_type}: {exists}")
+            # Check if at least the pipeline was saved
+            # The raise below is caught by the outer handler and becomes False.
+            if not verification_results['pipeline']:
+                raise Exception("Critical: Pipeline file was not created")
+            logger.info("🎉 Model artifacts saved successfully!")
+            return True
         except Exception as e:
+            logger.error(f"❌ Failed to save model artifacts: {str(e)}")
+            return False
+    def generate_metadata(self, model_name: str, metrics: Dict) -> Dict:
+        """Generate comprehensive metadata"""
+        # Generate data hash for versioning
+        data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8]
+        metadata = {
+            'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            'model_type': model_name,
+            'data_version': data_hash,
+            'training_metrics': {
+                'test_accuracy': metrics.get('accuracy', 'Unknown'),
+                'test_f1': metrics.get('f1', 'Unknown'),
+                'test_precision': metrics.get('precision', 'Unknown'),
+                'test_recall': metrics.get('recall', 'Unknown'),
+                'test_roc_auc': metrics.get('roc_auc', 'Unknown'),
+                'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
+                'cv_score_mean': metrics.get('cv_scores', {}).get('mean', 'Unknown'),
+                'cv_score_std': metrics.get('cv_scores', {}).get('std', 'Unknown')
+            },
+            'training_config': {
+                'test_size': self.test_size,
+                'validation_size': self.validation_size,
+                'cv_folds': self.cv_folds,
+                'max_features': self.max_features,
+                'ngram_range': self.ngram_range,
+                'feature_selection_k': self.feature_selection_k,
+                'class_weight': self.class_weight
+            },
+            'paths': {
+                'pipeline_file': str(PathConfig.PIPELINE_FILE),
+                'model_file': str(PathConfig.MODEL_FILE),
+                'vectorizer_file': str(PathConfig.VECTORIZER_FILE)
+            },
+            'timestamp': datetime.now().isoformat(),
+            'training_completed': True
+        }
+        return metadata
+    def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
+        """Evaluate a fitted model on the test split and return its metrics.
+
+        Always computes accuracy and weighted precision/recall/F1, a ROC AUC
+        (0.0 when unavailable) and the confusion matrix. The classification
+        report, cross-validation scores and the train-vs-test accuracy gap are
+        best-effort: if one of them fails, a warning is logged and evaluation
+        continues.
+
+        Args:
+            model: Fitted estimator exposing ``predict`` (and optionally
+                ``predict_proba``).
+            X_test, y_test: Held-out samples and labels.
+            X_train, y_train: Optional training split; when both are given,
+                cross-validation and the overfitting score are computed too.
+
+        Returns:
+            Dict of metrics. If the evaluation as a whole fails, a dict with
+            zeroed core metrics and an ``'error'`` message is returned instead
+            of raising.
+        """
+        logger.info("Starting comprehensive model evaluation...")
         try:
+            # Predictions
+            y_pred = model.predict(X_test)
+            # NOTE(review): column 1 is taken as the positive-class
+            # probability, which presumes a binary problem; confirm.
+            y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
+            # Basic metrics
+            metrics = {
+                'accuracy': float(accuracy_score(y_test, y_pred)),
+                'precision': float(precision_score(y_test, y_pred, average='weighted', zero_division=0)),
+                'recall': float(recall_score(y_test, y_pred, average='weighted', zero_division=0)),
+                'f1': float(f1_score(y_test, y_pred, average='weighted', zero_division=0))
+            }
+            # ROC AUC if probabilities available
+            if y_pred_proba is not None:
+                try:
+                    metrics['roc_auc'] = float(roc_auc_score(y_test, y_pred_proba))
+                except Exception as e:
+                    logger.warning(f"Could not calculate ROC AUC: {e}")
+                    metrics['roc_auc'] = 0.0
+            else:
+                metrics['roc_auc'] = 0.0
+            # Confusion matrix
+            cm = confusion_matrix(y_test, y_pred)
+            metrics['confusion_matrix'] = cm.tolist()
+            # Classification report
+            try:
+                class_report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
+                metrics['classification_report'] = class_report
+            except Exception as e:
+                logger.warning(f"Could not generate classification report: {e}")
+            # Cross-validation scores if training data provided
+            if X_train is not None and y_train is not None:
+                try:
+                    cv_scores = cross_val_score(
+                        model, X_train, y_train,
+                        cv=StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state),
+                        scoring='f1_weighted'
+                    )
+                    metrics['cv_scores'] = {
+                        'mean': float(cv_scores.mean()),
+                        'std': float(cv_scores.std()),
+                        'scores': cv_scores.tolist()
+                    }
+                except Exception as e:
+                    logger.warning(f"Cross-validation failed: {e}")
+                    metrics['cv_scores'] = {'mean': 0.0, 'std': 0.0, 'scores': []}
             # Training accuracy for overfitting detection
             if X_train is not None and y_train is not None:
+                try:
+                    y_train_pred = model.predict(X_train)
+                    train_accuracy = accuracy_score(y_train, y_train_pred)
+                    metrics['train_accuracy'] = float(train_accuracy)
+                    # Positive values mean the model scores higher on training data.
+                    metrics['overfitting_score'] = float(train_accuracy - metrics['accuracy'])
+                except Exception as e:
+                    logger.warning(f"Overfitting detection failed: {e}")
+            logger.info(f"📈 Evaluation completed - F1: {metrics['f1']:.4f}, Accuracy: {metrics['accuracy']:.4f}")
+            return metrics
         except Exception as e:
+            logger.error(f"❌ Evaluation failed: {e}")
+            # Callers receive a metrics-shaped dict even on failure; the
+            # 'error' key is the only sign that the numbers are placeholders.
+            return {
+                'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0,
+                'f1': 0.0, 'roc_auc': 0.0, 'error': str(e)
+            }
     def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
+        """Tune the named model inside ``pipeline`` with a grid search.
+
+        The estimator registered under ``self.models[model_name]['model']`` is
+        placed into the pipeline's ``'model'`` step and a ``GridSearchCV`` is
+        fitted on the training data.
+
+        Args:
+            pipeline: Pipeline with a ``'model'`` step to fill in.
+            X_train, y_train: Training samples and labels.
+            model_name: Key into ``self.models``.
+
+        Returns:
+            ``(best_estimator, tuning_results)``. If the search fails, the
+            pipeline is fitted with the model's default parameters and the
+            second element carries ``'error'`` and ``'used_default_params'``.
+
+        Raises:
+            Exception: whatever the fallback fit raises, if it fails as well.
+        """
+        logger.info(f"🔧 Starting hyperparameter tuning for {model_name}...")
         try:
             # Set the model in the pipeline
             pipeline.set_params(model=self.models[model_name]['model'])
             # Get parameter grid
             param_grid = self.models[model_name]['param_grid']
             # Create GridSearchCV
+            # NOTE(review): in the lines visible here `param_grid` is never
+            # passed to GridSearchCV, and no cv/scoring arguments appear.
+            # Confirm against the full file that these arguments are present.
             grid_search = GridSearchCV(
                 pipeline,
                 n_jobs=-1,
                 verbose=1
             )
             # Fit grid search
             grid_search.fit(X_train, y_train)
             # Extract results
             tuning_results = {
                 'best_params': grid_search.best_params_,
                 'cv_results': {
                     'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
                     'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
+                    'params': [dict(p) for p in grid_search.cv_results_['params']]
                 }
             }
             logger.info(f"Hyperparameter tuning completed for {model_name}")
             logger.info(f"Best score: {grid_search.best_score_:.4f}")
             logger.info(f"Best params: {grid_search.best_params_}")
             return grid_search.best_estimator_, tuning_results
         except Exception as e:
+            logger.error(f"❌ Hyperparameter tuning failed for {model_name}: {str(e)}")
             # Return basic model if tuning fails
+            try:
+                pipeline.set_params(model=self.models[model_name]['model'])
+                pipeline.fit(X_train, y_train)
+                return pipeline, {'error': str(e), 'used_default_params': True}
+            except Exception as e2:
+                logger.error(f"❌ Even basic model training failed: {str(e2)}")
+                raise e2
     def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
+        """Train, tune and evaluate every model registered in ``self.models``.
+
+        Each model gets a fresh pipeline, goes through
+        ``hyperparameter_tuning`` and is scored with
+        ``comprehensive_evaluation``. A failure for one model is logged and
+        recorded; the remaining models are still processed.
+
+        Returns:
+            Dict keyed by model name. Successful entries hold ``'model'``,
+            ``'evaluation_metrics'`` and ``'training_time'``; failed entries
+            hold only ``'error'``.
+        """
+        logger.info("🚀 Starting model training and evaluation...")
         results = {}
         for model_name in self.models.keys():
             logger.info(f"Training {model_name}...")
             try:
+                # Create fresh pipeline for each model
                 pipeline = self.create_preprocessing_pipeline()
                 # Hyperparameter tuning
                 best_model, tuning_results = self.hyperparameter_tuning(
                     pipeline, X_train, y_train, model_name
                 )
                 # Comprehensive evaluation
                 evaluation_metrics = self.comprehensive_evaluation(
                     best_model, X_test, y_test, X_train, y_train
                 )
                 # Store results
+                # NOTE(review): `tuning_results` is not stored in the lines
+                # visible here, yet save_evaluation_results reads
+                # result['tuning_results']. Confirm against the full file.
                 results[model_name] = {
                     'model': best_model,
                     'evaluation_metrics': evaluation_metrics,
                     'training_time': datetime.now().isoformat()
                 }
+                logger.info(f"✅ Model {model_name} - F1: {evaluation_metrics['f1']:.4f}, "
+                            f"Accuracy: {evaluation_metrics['accuracy']:.4f}")
             except Exception as e:
+                logger.error(f"❌ Training failed for {model_name}: {str(e)}")
                 results[model_name] = {'error': str(e)}
         return results
     def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
         """Select the best performing model"""
+        logger.info("🏆 Selecting best model...")
         best_model_name = None
         best_model = None
         best_score = -1
         best_metrics = None
         for model_name, result in results.items():
             if 'error' in result:
+                logger.warning(f"Skipping {model_name} due to error: {result['error']}")
                 continue
             # Use F1 score as primary metric
             f1_score = result['evaluation_metrics']['f1']
             if f1_score > best_score:
                 best_score = f1_score
                 best_model_name = model_name
                 best_model = result['model']
                 best_metrics = result['evaluation_metrics']
         if best_model_name is None:
+            raise ValueError("❌ No models trained successfully")
+        logger.info(f"🏆 Best model: {best_model_name} with F1 score: {best_score:.4f}")
         return best_model_name, best_model, best_metrics
     def save_evaluation_results(self, results: Dict) -> bool:
+        """Write a JSON-serializable copy of ``results`` to disk.
+
+        The cleaned results are dumped to ``PathConfig.EVALUATION_RESULTS``;
+        values JSON cannot encode are stringified via ``default=str``.
+
+        Returns:
+            True on success, False if anything raised (the error is logged).
+        """
+        # NOTE(review): the lines between `try:` and `else:` (creation of
+        # `clean_results` and the loop over `results`) are not visible in this
+        # view; confirm the full body against the complete file.
         try:
                 else:
                     clean_results[model_name] = {
                         'tuning_results': {
+                            k: v for k, v in result['tuning_results'].items()
+                            if k != 'best_estimator'  # Can't serialize sklearn objects
                         },
                         'evaluation_metrics': result['evaluation_metrics'],
                         'training_time': result['training_time']
                     }
+            # Save results to centralized path
+            evaluation_path = PathConfig.EVALUATION_RESULTS
+            with open(evaluation_path, 'w') as f:
                 json.dump(clean_results, f, indent=2, default=str)
+            logger.info(f"📊 Evaluation results saved to {evaluation_path}")
             return True
         except Exception as e:
+            logger.error(f"❌ Failed to save evaluation results: {str(e)}")
             return False
     def train_model(self, data_path: str = None) -> Tuple[bool, str]:
+        """Run the full training pipeline end to end.
+
+        Steps: ensure directories exist, load and validate the data, make a
+        stratified train/test split, train and evaluate all models, select the
+        best one, save its artifacts and save the evaluation results.
+
+        Args:
+            data_path: NOTE(review): not used in the body below;
+                ``load_and_validate_data()`` is called without arguments.
+                Confirm whether this parameter is meant to be honoured.
+
+        Returns:
+            ``(success, message)``. Exceptions are caught, logged with a
+            traceback and reported as ``(False, error_message)``.
+        """
         try:
+            logger.info("🚀 Starting model training pipeline...")
+            # Log system information
+            logger.info(f"Training environment: {PathConfig.BASE_DIR}")
+            PathConfig.ensure_directories()
             # Load and validate data
             success, df, message = self.load_and_validate_data()
             if not success:
                 return False, message
             # Prepare data
             X = df['text'].values
             y = df['label'].values
             # Train-test split
             X_train, X_test, y_train, y_test = train_test_split(
+                X, y,
                 test_size=self.test_size,
                 stratify=y,
                 random_state=self.random_state
             )
             logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
             # Train and evaluate models
             results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
+            # Check if any models were trained successfully
+            successful_models = [name for name, result in results.items() if 'error' not in result]
+            if not successful_models:
+                return False, "❌ All model training attempts failed"
             # Select best model
             best_model_name, best_model, best_metrics = self.select_best_model(results)
+            # Save model artifacts with FIXED paths
             if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
+                return False, "❌ Failed to save model artifacts"
             # Save evaluation results
+            # The return value is ignored: a failure here does not fail training.
             self.save_evaluation_results(results)
             success_message = (
+                f"Model training completed successfully!\n"
+                f"Best model: {best_model_name}\n"
+                f"Performance: F1={best_metrics['f1']:.4f}, Accuracy={best_metrics['accuracy']:.4f}\n"
+                f"Artifacts saved to: {PathConfig.MODEL_DIR}"
             )
             logger.info(success_message)
             return True, success_message
         except Exception as e:
+            error_message = f"❌ Model training failed: {str(e)}"
             logger.error(error_message)
+            # NOTE(review): relies on a module-level `traceback` import;
+            # main() imports it only locally. Confirm it is imported at the top.
+            logger.error(f"📍 Full traceback: {traceback.format_exc()}")
             return False, error_message
+# =============================================================================
+# TRAINING UTILITIES AND DIAGNOSTICS
+# =============================================================================
+class TrainingDiagnostics:
+    """Diagnostic utilities for training pipeline"""
+    @staticmethod
+    def check_data_availability():
+        """Check if training data is available"""
+        data_path = PathConfig.COMBINED_DATASET
+        if not data_path.exists():
+            logger.error(f"❌ Training data not found at: {data_path}")
+            # Check what files are available
+            if PathConfig.DATA_DIR.exists():
+                available_files = list(PathConfig.DATA_DIR.iterdir())
+                logger.info(f"Available files in data directory: {[f.name for f in available_files]}")
+            else:
+                logger.error(f"❌ Data directory doesn't exist: {PathConfig.DATA_DIR}")
+            return False
+        logger.info(f"✅ Training data found at: {data_path}")
+        return True
+    @staticmethod
+    def verify_model_output():
+        """Verify that model files were created correctly"""
+        files_to_check = {
+            'Pipeline': PathConfig.PIPELINE_FILE,
+            'Model': PathConfig.MODEL_FILE,
+            'Vectorizer': PathConfig.VECTORIZER_FILE,
+            'Metadata': PathConfig.METADATA_FILE
+        }
+        logger.info("🔍 Verifying model output files:")
+        all_exist = True
+        for file_type, file_path in files_to_check.items():
+            exists = file_path.exists()
+            size = file_path.stat().st_size if exists else 0
+            status = "✅" if exists else "❌"
+            logger.info(f"  {status} {file_type}: {file_path} ({size} bytes)")
+            if not exists:
+                all_exist = False
+        return all_exist
+    @staticmethod
+    def test_model_loading():
+        """Test if the saved model can be loaded correctly"""
+        try:
+            logger.info("🧪 Testing model loading...")
+            # Try loading pipeline
+            if PathConfig.PIPELINE_FILE.exists():
+                pipeline = joblib.load(PathConfig.PIPELINE_FILE)
+                logger.info("✅ Pipeline loaded successfully")
+                # Test prediction
+                test_text = ["This is a test article for verification."]
+                prediction = pipeline.predict(test_text)
+                logger.info(f"✅ Test prediction successful: {prediction}")
+                return True
+            else:
+                logger.error("❌ Pipeline file not found")
+                return False
+        except Exception as e:
+            logger.error(f"❌ Model loading test failed: {e}")
+            return False
+# ================================
+# ENHANCED MAIN EXECUTION FUNCTION
+# ================================
 def main():
+    """Enhanced main execution function with comprehensive diagnostics"""
+    import traceback
+    logger.info("🚀 Starting Enhanced Model Training Pipeline")
+    logger.info("=" * 60)
+    try:
+        # Step 1: Check data availability
+        logger.info("📋 Step 1: Checking data availability...")
+        if not TrainingDiagnostics.check_data_availability():
+            logger.error("❌ Training aborted: No data available")
+            print("❌ Training failed: Training data not found")
+            print(f"📁 Expected data location: {PathConfig.COMBINED_DATASET}")
+            print("💡 Please ensure the data preparation step has been completed")
+            exit(1)
+        # Step 2: Initialize trainer
+        logger.info("📋 Step 2: Initializing trainer...")
+        trainer = RobustModelTrainer()
+        # Step 3: Train model
+        logger.info("📋 Step 3: Training model...")
+        success, message = trainer.train_model()
+        if success:
+            # Step 4: Verify output
+            logger.info("📋 Step 4: Verifying model output...")
+            if TrainingDiagnostics.verify_model_output():
+                logger.info("✅ All model files created successfully")
+            else:
+                logger.warning("⚠️ Some model files may be missing")
+            # Step 5: Test model loading
+            logger.info("📋 Step 5: Testing model loading...")
+            if TrainingDiagnostics.test_model_loading():
+                logger.info("✅ Model loading verification successful")
+            else:
+                logger.warning("⚠️ Model loading verification failed")
+            # Success summary
+            logger.info("=" * 60)
+            logger.info("TRAINING COMPLETED SUCCESSFULLY!")
+            logger.info("=" * 60)
+            print("✅ Training completed successfully!")
+            print(f"{message}")
+            print(f"Model files saved to: {PathConfig.MODEL_DIR}")
+            print("Next steps:")
+            print("  1. Start the FastAPI server to test predictions")
+            print("  2. Run the monitoring dashboard")
+            print("  3. Perform model validation tests")
+        else:
+            logger.error("=" * 60)
+            logger.error("❌ TRAINING FAILED!")
+            logger.error("=" * 60)
+            print("❌ Training failed!")
+            print(f"📄 Error: {message}")
+            print("\n🔧 Troubleshooting steps:")
+            print("  1. Check if training data exists and is properly formatted")
+            print("  2. Verify sufficient disk space and memory")
+            print("  3. Review the training logs for detailed error information")
+            exit(1)
+    except KeyboardInterrupt:
+        logger.info("⏹️ Training interrupted by user")
+        print("\n⏹️ Training interrupted by user")
         exit(1)
+    except Exception as e:
+        logger.error(f"Unexpected error during training: {str(e)}")
+        logger.error(f"Full traceback: {traceback.format_exc()}")
+        print(f"Unexpected error: {str(e)}")
+        print("Check the training logs for more details")
+        exit(1)
+# ============================
+# STANDALONE TESTING FUNCTIONS
+# ============================
+def test_path_configuration():
+    """Test path configuration and directory creation"""
+    print("🧪 Testing path configuration...")
+    PathConfig.ensure_directories()
+    directories = [
+        PathConfig.BASE_DIR, PathConfig.DATA_DIR,
+        PathConfig.MODEL_DIR, PathConfig.LOGS_DIR, PathConfig.RESULTS_DIR
+    ]
+    for directory in directories:
+        if directory.exists():
+            print(f"✅ {directory}")
+        else:
+            print(f"❌ {directory}")
+    print("\n Expected file locations:")
+    print(f"  Pipeline: {PathConfig.PIPELINE_FILE}")
+    print(f"  Model: {PathConfig.MODEL_FILE}")
+    print(f"  Vectorizer: {PathConfig.VECTORIZER_FILE}")
+    print(f"  Metadata: {PathConfig.METADATA_FILE}")
+def quick_data_check():
+    """Quick check of training data"""
+    print("Quick data check...")
+    data_path = PathConfig.COMBINED_DATASET
+    if data_path.exists():
+        try:
+            df = pd.read_csv(data_path)
+            print(f"Data loaded: {len(df)} rows, {len(df.columns)} columns")
+            print(f"Columns: {list(df.columns)}")
+            if 'label' in df.columns:
+                print(f"Label distribution: {df['label'].value_counts().to_dict()}")
+        except Exception as e:
+            print(f"❌ Error reading data: {e}")
+    else:
+        print(f"❌ Data file not found: {data_path}")
 if __name__ == "__main__":
+    import sys
+    # Handle command line arguments for testing
+    if len(sys.argv) > 1:
+        if sys.argv[1] == "test-paths":
+            test_path_configuration()
+        elif sys.argv[1] == "test-data":
+            quick_data_check()
+        elif sys.argv[1] == "test-loading":
+            TrainingDiagnostics.test_model_loading()
+        else:
+            print("Available test commands:")
+            print("  python train.py test-paths     # Test path configuration")
+            print("  python train.py test-data      # Quick data check")
+            print("  python train.py test-loading   # Test model loading")
+    else:
+        # Run main training
+        main()