Spaces:

Ahmedik95316
/

Fake-News-Detection-with-MLOps

Running

App Files Files Community

Ahmedik95316 commited on Aug 19

Commit

719d51e

1 Parent(s): c29bcf3

Update model/train.py

Browse files

Files changed (1) hide show

model/train.py +201 -109

model/train.py CHANGED Viewed

@@ -22,7 +22,10 @@ import logging
 import json
 import joblib
 import hashlib
-from datetime import datetime
 from typing import Dict, Tuple, Optional, Any
 import warnings
 import re
@@ -75,6 +78,114 @@ def preprocess_text_function(texts):
     return processed
 class RobustModelTrainer:
     """Production-ready model trainer with comprehensive evaluation and validation"""
@@ -82,6 +193,7 @@ class RobustModelTrainer:
         self.setup_paths()
         self.setup_training_config()
         self.setup_models()
     def setup_paths(self):
         """Setup all necessary paths"""
@@ -107,14 +219,14 @@ class RobustModelTrainer:
         self.test_size = 0.2
         self.validation_size = 0.1
         self.random_state = 42
-        self.cv_folds = 5
-        self.max_features = 10000
-        self.min_df = 2
         self.max_df = 0.95
-        self.ngram_range = (1, 3)
-        self.max_iter = 1000
         self.class_weight = 'balanced'
-        self.feature_selection_k = 5000
     def setup_models(self):
         """Setup model configurations for comparison"""
@@ -123,24 +235,24 @@ class RobustModelTrainer:
                 'model': LogisticRegression(
                     max_iter=self.max_iter,
                     class_weight=self.class_weight,
-                    random_state=self.random_state
                 ),
                 'param_grid': {
-                    'model__C': [0.1, 1, 10, 100],
-                    'model__penalty': ['l2'],
-                    'model__solver': ['liblinear', 'lbfgs']
                 }
             },
             'random_forest': {
                 'model': RandomForestClassifier(
-                    n_estimators=100,
                     class_weight=self.class_weight,
-                    random_state=self.random_state
                 ),
                 'param_grid': {
-                    'model__n_estimators': [50, 100, 200],
-                    'model__max_depth': [10, 20, None],
-                    'model__min_samples_split': [2, 5, 10]
                 }
             }
         }
@@ -149,6 +261,8 @@ class RobustModelTrainer:
         """Load and validate training data"""
         try:
             logger.info("Loading training data...")
             if not self.data_path.exists():
                 return False, None, f"Data file not found: {self.data_path}"
@@ -182,7 +296,7 @@ class RobustModelTrainer:
                 return False, None, f"Need at least 2 classes, found: {unique_labels}"
             # Check minimum sample size
-            if len(df) < 100:
                 return False, None, f"Insufficient samples for training: {len(df)}"
             # Check class balance
@@ -204,15 +318,18 @@ class RobustModelTrainer:
             return False, None, error_msg
     def create_preprocessing_pipeline(self) -> Pipeline:
-        """Create advanced preprocessing pipeline - pickle-safe"""
         # Use the standalone function instead of lambda
         text_preprocessor = FunctionTransformer(
-            func=preprocess_text_function,  # ✅ Pickle-safe function reference
             validate=False
         )
-        # TF-IDF vectorization
         vectorizer = TfidfVectorizer(
             max_features=self.max_features,
             min_df=self.min_df,
@@ -226,7 +343,7 @@ class RobustModelTrainer:
         # Feature selection
         feature_selector = SelectKBest(
             score_func=chi2,
-            k=self.feature_selection_k
         )
         # Create pipeline
@@ -241,8 +358,10 @@ class RobustModelTrainer:
     def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
         """Comprehensive model evaluation with multiple metrics"""
-        logger.info("Starting comprehensive model evaluation...")
         # Predictions
         y_pred = model.predict(X_test)
         y_pred_proba = model.predict_proba(X_test)[:, 1]
@@ -260,18 +379,18 @@ class RobustModelTrainer:
         cm = confusion_matrix(y_test, y_pred)
         metrics['confusion_matrix'] = cm.tolist()
-        # Classification report
-        class_report = classification_report(y_test, y_pred, output_dict=True)
-        metrics['classification_report'] = class_report
         # Cross-validation scores if training data provided
-        if X_train is not None and y_train is not None:
             try:
                 cv_scores = cross_val_score(
                     model, X_train, y_train,
                     cv=StratifiedKFold(
-                        n_splits=self.cv_folds, shuffle=True, random_state=self.random_state),
-                    scoring='f1_weighted'
                 )
                 metrics['cv_scores'] = {
                     'mean': float(cv_scores.mean()),
@@ -281,30 +400,11 @@ class RobustModelTrainer:
             except Exception as e:
                 logger.warning(f"Cross-validation failed: {e}")
                 metrics['cv_scores'] = None
-        # Feature importance (if available)
-        try:
-            if hasattr(model, 'feature_importances_'):
-                feature_importance = model.feature_importances_
-                metrics['feature_importance_stats'] = {
-                    'mean': float(feature_importance.mean()),
-                    'std': float(feature_importance.std()),
-                    'top_features': feature_importance.argsort()[-10:][::-1].tolist()
-                }
-            elif hasattr(model, 'coef_'):
-                coefficients = model.coef_[0]
-                metrics['coefficient_stats'] = {
-                    'mean': float(coefficients.mean()),
-                    'std': float(coefficients.std()),
-                    'top_positive': coefficients.argsort()[-10:][::-1].tolist(),
-                    'top_negative': coefficients.argsort()[:10].tolist()
-                }
-        except Exception as e:
-            logger.warning(f"Feature importance extraction failed: {e}")
-        # Model complexity metrics
         try:
-            # Training accuracy for overfitting detection
             if X_train is not None and y_train is not None:
                 y_train_pred = model.predict(X_train)
                 train_accuracy = accuracy_score(y_train, y_train_pred)
@@ -318,7 +418,9 @@ class RobustModelTrainer:
     def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
         """Perform hyperparameter tuning with cross-validation"""
-        logger.info(f"Starting hyperparameter tuning for {model_name}...")
         try:
             # Set the model in the pipeline
@@ -327,15 +429,18 @@ class RobustModelTrainer:
             # Get parameter grid
             param_grid = self.models[model_name]['param_grid']
             # Create GridSearchCV
             grid_search = GridSearchCV(
                 pipeline,
                 param_grid,
-                cv=StratifiedKFold(n_splits=self.cv_folds,
                                    shuffle=True, random_state=self.random_state),
                 scoring='f1_weighted',
-                n_jobs=-1,
-                verbose=1
             )
             # Fit grid search
@@ -369,8 +474,7 @@ class RobustModelTrainer:
     def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
         """Train and evaluate multiple models"""
-        logger.info("Starting model training and evaluation...")
         results = {}
         for model_name in self.models.keys():
@@ -409,7 +513,9 @@ class RobustModelTrainer:
     def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
         """Select the best performing model"""
-        logger.info("Selecting best model...")
         best_model_name = None
         best_model = None
@@ -439,7 +545,8 @@ class RobustModelTrainer:
     def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
         """Save model artifacts and metadata"""
         try:
-            logger.info("Saving model artifacts...")
             # Save the full pipeline
             joblib.dump(model, self.pipeline_path)
@@ -449,14 +556,10 @@ class RobustModelTrainer:
             if hasattr(model, 'named_steps') and 'model' in model.named_steps:
                 joblib.dump(model.named_steps['model'], self.model_path)
                 logger.info(f"✅ Saved model to {self.model_path}")
-            else:
-                logger.warning("❌ Could not extract model component")
             if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
                 joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
                 logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
-            else:
-                logger.warning("❌ Could not extract vectorizer component")
             # Generate data hash
             data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
@@ -466,8 +569,6 @@ class RobustModelTrainer:
                 'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                 'model_type': model_name,
                 'data_version': data_hash,
-                'train_size': metrics.get('train_accuracy', 'Unknown'),
-                'test_size': len(metrics.get('confusion_matrix', [[0]])[0]) if 'confusion_matrix' in metrics else 'Unknown',
                 'test_accuracy': metrics['accuracy'],
                 'test_f1': metrics['f1'],
                 'test_precision': metrics['precision'],
@@ -479,7 +580,6 @@ class RobustModelTrainer:
                 'timestamp': datetime.now().isoformat(),
                 'training_config': {
                     'test_size': self.test_size,
-                    'validation_size': self.validation_size,
                     'cv_folds': self.cv_folds,
                     'max_features': self.max_features,
                     'ngram_range': self.ngram_range,
@@ -492,46 +592,12 @@ class RobustModelTrainer:
                 json.dump(metadata, f, indent=2)
             logger.info(f"✅ Model artifacts saved successfully")
-            logger.info(f"Model path: {self.model_path}")
-            logger.info(f"Vectorizer path: {self.vectorizer_path}")
-            logger.info(f"Pipeline path: {self.pipeline_path}")
-            logger.info(f"Metadata path: {self.metadata_path}")
             return True
         except Exception as e:
             logger.error(f"Failed to save model artifacts: {str(e)}")
             return False
-    def save_evaluation_results(self, results: Dict) -> bool:
-        """Save comprehensive evaluation results"""
-        try:
-            # Clean results for JSON serialization
-            clean_results = {}
-            for model_name, result in results.items():
-                if 'error' in result:
-                    clean_results[model_name] = result
-                else:
-                    clean_results[model_name] = {
-                        'tuning_results': {
-                            k: v for k, v in result['tuning_results'].items()
-                            if k != 'best_estimator'
-                        },
-                        'evaluation_metrics': result['evaluation_metrics'],
-                        'training_time': result['training_time']
-                    }
-            # Save results
-            with open(self.evaluation_path, 'w') as f:
-                json.dump(clean_results, f, indent=2, default=str)
-            logger.info(f"Evaluation results saved to {self.evaluation_path}")
-            return True
-        except Exception as e:
-            logger.error(f"Failed to save evaluation results: {str(e)}")
-            return False
     def train_model(self, data_path: str = None) -> Tuple[bool, str]:
         """Main training function with comprehensive pipeline"""
         try:
@@ -546,35 +612,52 @@ class RobustModelTrainer:
             if not success:
                 return False, message
             # Prepare data
             X = df['text'].values
             y = df['label'].values
             # Train-test split
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y,
                 test_size=self.test_size,
-                stratify=y,
                 random_state=self.random_state
             )
-            logger.info(
-                f"Data split: {len(X_train)} train, {len(X_test)} test")
             # Train and evaluate models
             results = self.train_and_evaluate_models(
                 X_train, X_test, y_train, y_test)
             # Select best model
-            best_model_name, best_model, best_metrics = self.select_best_model(
-                results)
             # Save model artifacts
             if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
                 return False, "Failed to save model artifacts"
-            # Save evaluation results
-            self.save_evaluation_results(results)
             success_message = (
                 f"Model training completed successfully. "
@@ -586,6 +669,8 @@ class RobustModelTrainer:
             return True, success_message
         except Exception as e:
             error_message = f"Model training failed: {str(e)}"
             logger.error(error_message)
             return False, error_message
@@ -593,8 +678,15 @@ class RobustModelTrainer:
 def main():
     """Main execution function"""
     trainer = RobustModelTrainer()
-    success, message = trainer.train_model()
     if success:
         print(f"✅ {message}")

 import json
 import joblib
 import hashlib
+import sys
+import os
+import time
+from datetime import datetime, timedelta
 from typing import Dict, Tuple, Optional, Any
 import warnings
 import re
     return processed
+class ProgressTracker:
+    """Progress tracking with time estimation"""
+    def __init__(self, total_steps: int, description: str = "Training"):
+        self.total_steps = total_steps
+        self.current_step = 0
+        self.start_time = time.time()
+        self.description = description
+        self.step_times = []
+    def update(self, step_name: str = ""):
+        """Update progress and print status"""
+        self.current_step += 1
+        current_time = time.time()
+        elapsed = current_time - self.start_time
+        # Calculate progress percentage
+        progress_pct = (self.current_step / self.total_steps) * 100
+        # Estimate remaining time
+        if self.current_step > 0:
+            avg_time_per_step = elapsed / self.current_step
+            remaining_steps = self.total_steps - self.current_step
+            eta_seconds = avg_time_per_step * remaining_steps
+            eta = timedelta(seconds=int(eta_seconds))
+        else:
+            eta = "calculating..."
+        # Create progress bar
+        bar_length = 30
+        filled_length = int(bar_length * self.current_step // self.total_steps)
+        bar = '█' * filled_length + '░' * (bar_length - filled_length)
+        # Print progress
+        status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
+        if step_name:
+            status_msg += f" | {step_name}"
+        if eta != "calculating...":
+            status_msg += f" | ETA: {eta}"
+        print(status_msg, end='', flush=True)
+        # Store step time for better estimation
+        if len(self.step_times) >= 3:  # Keep last 3 step times for moving average
+            self.step_times.pop(0)
+        self.step_times.append(current_time - (self.start_time + sum(self.step_times)))
+    def finish(self):
+        """Complete progress tracking"""
+        total_time = time.time() - self.start_time
+        print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
+def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 3) -> Dict:
+    """Estimate training time based on dataset characteristics"""
+    # Base time estimates (in seconds) based on empirical testing
+    base_times = {
+        'preprocessing': max(0.1, dataset_size * 0.001),  # ~1ms per sample
+        'vectorization': max(0.5, dataset_size * 0.01),   # ~10ms per sample
+        'feature_selection': max(0.2, dataset_size * 0.005), # ~5ms per sample
+        'simple_training': max(1.0, dataset_size * 0.02),  # ~20ms per sample
+        'evaluation': max(0.5, dataset_size * 0.01),       # ~10ms per sample
+    }
+    # Hyperparameter tuning multipliers
+    tuning_multipliers = {
+        'logistic_regression': 8 if enable_tuning else 1,  # 8 param combinations
+        'random_forest': 12 if enable_tuning else 1,       # 12 param combinations
+    }
+    # Cross-validation multiplier
+    cv_multiplier = cv_folds if dataset_size > 100 else 1
+    # Calculate estimates
+    estimates = {}
+    # Preprocessing steps
+    estimates['data_loading'] = 0.5
+    estimates['preprocessing'] = base_times['preprocessing']
+    estimates['vectorization'] = base_times['vectorization']
+    estimates['feature_selection'] = base_times['feature_selection']
+    # Model training
+    for model_name, multiplier in tuning_multipliers.items():
+        model_time = base_times['simple_training'] * multiplier * cv_multiplier
+        estimates[f'{model_name}_training'] = model_time
+        estimates[f'{model_name}_evaluation'] = base_times['evaluation']
+    # Model saving
+    estimates['model_saving'] = 1.0
+    # Total estimate
+    total_estimate = sum(estimates.values())
+    # Add 20% buffer for overhead
+    total_estimate *= 1.2
+    return {
+        'detailed_estimates': estimates,
+        'total_seconds': total_estimate,
+        'total_formatted': str(timedelta(seconds=int(total_estimate))),
+        'dataset_size': dataset_size,
+        'enable_tuning': enable_tuning,
+        'cv_folds': cv_folds
+    }
 class RobustModelTrainer:
     """Production-ready model trainer with comprehensive evaluation and validation"""
         self.setup_paths()
         self.setup_training_config()
         self.setup_models()
+        self.progress_tracker = None
     def setup_paths(self):
         """Setup all necessary paths"""
         self.test_size = 0.2
         self.validation_size = 0.1
         self.random_state = 42
+        self.cv_folds = 3
+        self.max_features = 5000  # Reduced for speed
+        self.min_df = 1  # More lenient for small datasets
         self.max_df = 0.95
+        self.ngram_range = (1, 2)  # Reduced for speed
+        self.max_iter = 500  # Reduced for speed
         self.class_weight = 'balanced'
+        self.feature_selection_k = 2000  # Reduced for speed
     def setup_models(self):
         """Setup model configurations for comparison"""
                 'model': LogisticRegression(
                     max_iter=self.max_iter,
                     class_weight=self.class_weight,
+                    random_state=self.random_state,
+                    n_jobs=-1  # Use all cores
                 ),
                 'param_grid': {
+                    'model__C': [0.1, 1, 10],  # Reduced grid
+                    'model__penalty': ['l2']
                 }
             },
             'random_forest': {
                 'model': RandomForestClassifier(
+                    n_estimators=50,  # Reduced for speed
                     class_weight=self.class_weight,
+                    random_state=self.random_state,
+                    n_jobs=-1  # Use all cores
                 ),
                 'param_grid': {
+                    'model__n_estimators': [50, 100],  # Reduced grid
+                    'model__max_depth': [10, None]
                 }
             }
         }
         """Load and validate training data"""
         try:
             logger.info("Loading training data...")
+            if self.progress_tracker:
+                self.progress_tracker.update("Loading data")
             if not self.data_path.exists():
                 return False, None, f"Data file not found: {self.data_path}"
                 return False, None, f"Need at least 2 classes, found: {unique_labels}"
             # Check minimum sample size
+            if len(df) < 10:
                 return False, None, f"Insufficient samples for training: {len(df)}"
             # Check class balance
             return False, None, error_msg
     def create_preprocessing_pipeline(self) -> Pipeline:
+        """Create preprocessing pipeline"""
+        if self.progress_tracker:
+            self.progress_tracker.update("Creating pipeline")
         # Use the standalone function instead of lambda
         text_preprocessor = FunctionTransformer(
+            func=preprocess_text_function,
             validate=False
         )
+        # TF-IDF vectorization with optimized parameters
         vectorizer = TfidfVectorizer(
             max_features=self.max_features,
             min_df=self.min_df,
         # Feature selection
         feature_selector = SelectKBest(
             score_func=chi2,
+            k=min(self.feature_selection_k, self.max_features)
         )
         # Create pipeline
     def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
         """Comprehensive model evaluation with multiple metrics"""
+        if self.progress_tracker:
+            self.progress_tracker.update("Evaluating model")
         # Predictions
         y_pred = model.predict(X_test)
         y_pred_proba = model.predict_proba(X_test)[:, 1]
         cm = confusion_matrix(y_test, y_pred)
         metrics['confusion_matrix'] = cm.tolist()
         # Cross-validation scores if training data provided
+        if X_train is not None and y_train is not None and len(X_train) >= 50:
             try:
                 cv_scores = cross_val_score(
                     model, X_train, y_train,
                     cv=StratifiedKFold(
+                        n_splits=min(self.cv_folds, len(X_train) // 10),
+                        shuffle=True,
+                        random_state=self.random_state
+                    ),
+                    scoring='f1_weighted',
+                    n_jobs=-1  # Parallel CV
                 )
                 metrics['cv_scores'] = {
                     'mean': float(cv_scores.mean()),
             except Exception as e:
                 logger.warning(f"Cross-validation failed: {e}")
                 metrics['cv_scores'] = None
+        else:
+            metrics['cv_scores'] = {'note': 'Skipped for small dataset'}
+        # Training accuracy for overfitting detection
         try:
             if X_train is not None and y_train is not None:
                 y_train_pred = model.predict(X_train)
                 train_accuracy = accuracy_score(y_train, y_train_pred)
     def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
         """Perform hyperparameter tuning with cross-validation"""
+        if self.progress_tracker:
+            self.progress_tracker.update(f"Tuning {model_name}")
         try:
             # Set the model in the pipeline
             # Get parameter grid
             param_grid = self.models[model_name]['param_grid']
+            # Adaptive CV folds based on dataset size
+            cv_folds = min(self.cv_folds, len(X_train) // 10, 5)
             # Create GridSearchCV
             grid_search = GridSearchCV(
                 pipeline,
                 param_grid,
+                cv=StratifiedKFold(n_splits=cv_folds,
                                    shuffle=True, random_state=self.random_state),
                 scoring='f1_weighted',
+                n_jobs=-1,  # Use all cores
+                verbose=0   # Reduce verbosity for speed
             )
             # Fit grid search
     def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
         """Train and evaluate multiple models"""
         results = {}
         for model_name in self.models.keys():
     def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
         """Select the best performing model"""
+        if self.progress_tracker:
+            self.progress_tracker.update("Selecting best model")
         best_model_name = None
         best_model = None
     def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
         """Save model artifacts and metadata"""
         try:
+            if self.progress_tracker:
+                self.progress_tracker.update("Saving model")
             # Save the full pipeline
             joblib.dump(model, self.pipeline_path)
             if hasattr(model, 'named_steps') and 'model' in model.named_steps:
                 joblib.dump(model.named_steps['model'], self.model_path)
                 logger.info(f"✅ Saved model to {self.model_path}")
             if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
                 joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
                 logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
             # Generate data hash
             data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
                 'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                 'model_type': model_name,
                 'data_version': data_hash,
                 'test_accuracy': metrics['accuracy'],
                 'test_f1': metrics['f1'],
                 'test_precision': metrics['precision'],
                 'timestamp': datetime.now().isoformat(),
                 'training_config': {
                     'test_size': self.test_size,
                     'cv_folds': self.cv_folds,
                     'max_features': self.max_features,
                     'ngram_range': self.ngram_range,
                 json.dump(metadata, f, indent=2)
             logger.info(f"✅ Model artifacts saved successfully")
             return True
         except Exception as e:
             logger.error(f"Failed to save model artifacts: {str(e)}")
             return False
     def train_model(self, data_path: str = None) -> Tuple[bool, str]:
         """Main training function with comprehensive pipeline"""
         try:
             if not success:
                 return False, message
+            # Estimate training time and setup progress tracker
+            time_estimate = estimate_training_time(
+                len(df),
+                enable_tuning=True,
+                cv_folds=self.cv_folds
+            )
+            print(f"\n📊 Training Configuration:")
+            print(f"Dataset size: {len(df)} samples")
+            print(f"Estimated time: {time_estimate['total_formatted']}")
+            print(f"Models to train: {len(self.models)}")
+            print(f"Cross-validation folds: {self.cv_folds}")
+            print()
+            # Setup progress tracker
+            total_steps = 4 + (len(self.models) * 2) + 1  # Load, split, 2*models, select, save
+            self.progress_tracker = ProgressTracker(total_steps, "Training Progress")
             # Prepare data
             X = df['text'].values
             y = df['label'].values
             # Train-test split
+            self.progress_tracker.update("Splitting data")
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y,
                 test_size=self.test_size,
+                stratify=y if len(np.unique(y)) > 1 and len(y) > 10 else None,
                 random_state=self.random_state
             )
+            logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
             # Train and evaluate models
             results = self.train_and_evaluate_models(
                 X_train, X_test, y_train, y_test)
             # Select best model
+            best_model_name, best_model, best_metrics = self.select_best_model(results)
             # Save model artifacts
             if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
                 return False, "Failed to save model artifacts"
+            # Finish progress tracking
+            self.progress_tracker.finish()
             success_message = (
                 f"Model training completed successfully. "
             return True, success_message
         except Exception as e:
+            if self.progress_tracker:
+                print()  # New line after progress bar
             error_message = f"Model training failed: {str(e)}"
             logger.error(error_message)
             return False, error_message
 def main():
     """Main execution function"""
+    import argparse
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Train fake news detection model')
+    parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
+    args = parser.parse_args()
     trainer = RobustModelTrainer()
+    success, message = trainer.train_model(data_path=args.data_path)
     if success:
         print(f"✅ {message}")