Spaces:

Ahmedik95316
/

Fake-News-Detection-with-MLOps

Running

App Files Community

Ahmedik95316 committed on Aug 29

Commit

2d38242

verified ·

1 Parent(s): 6041335

Update initialize_system.py

Browse files

Modified to run the ensemble training at the start

Files changed (1) hide show

initialize_system.py +99 -108

initialize_system.py CHANGED Viewed

@@ -1,11 +1,19 @@
 import os
 import sys
 import json
 import shutil
 import pandas as pd
 from pathlib import Path
 from datetime import datetime
 from sklearn.model_selection import cross_validate
 # Import the new path manager
 try:
@@ -181,7 +189,7 @@ def create_minimal_dataset():
 def run_initial_training():
-    """Run basic model training"""
     log_step("Starting initial model training...")
     try:
@@ -196,41 +204,92 @@ def run_initial_training():
         # Check if model already exists
         if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
-            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
-            # If individual components exist but pipeline doesn't, create pipeline
-            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
-                log_step("Creating pipeline from existing components...")
                 try:
                     import joblib
-                    from sklearn.pipeline import Pipeline
-                    # Load existing components
-                    model = joblib.load(model_path)
-                    vectorizer = joblib.load(vectorizer_path)
-                    # Create pipeline
-                    pipeline = Pipeline([
-                        ('vectorizer', vectorizer),
-                        ('model', model)
-                    ])
-                    # Save pipeline
-                    joblib.dump(pipeline, pipeline_path)
-                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
                 except Exception as e:
-                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
             return True
-        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.model_selection import train_test_split
-        from sklearn.metrics import accuracy_score, f1_score
         from sklearn.pipeline import Pipeline
         import joblib
         # Load dataset
         dataset_path = path_manager.get_combined_dataset_path()
@@ -259,7 +318,7 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
-        # Create pipeline with preprocessing
         pipeline = Pipeline([
             ('vectorizer', TfidfVectorizer(
                 max_features=5000,
@@ -276,9 +335,9 @@ def run_initial_training():
         ])
         # Train model with cross-validation
-        log_step("Training model with cross-validation...")
-        # Perform cross-validation before final training
         cv_results = cross_validate(
             pipeline, X_train, y_train,
             cv=3,
@@ -294,63 +353,11 @@ def run_initial_training():
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
-        # Save CV results for API access
-        cv_data = {
-            "n_splits": 3,
-            "test_scores": {
-                "accuracy": {
-                    "mean": float(cv_results['test_accuracy'].mean()),
-                    "std": float(cv_results['test_accuracy'].std()),
-                    "scores": cv_results['test_accuracy'].tolist()
-                },
-                "f1": {
-                    "mean": float(cv_results['test_f1_weighted'].mean()),
-                    "std": float(cv_results['test_f1_weighted'].std()),
-                    "scores": cv_results['test_f1_weighted'].tolist()
-                }
-            },
-            "train_scores": {
-                "accuracy": {
-                    "mean": float(cv_results['train_accuracy'].mean()),
-                    "std": float(cv_results['train_accuracy'].std()),
-                    "scores": cv_results['train_accuracy'].tolist()
-                },
-                "f1": {
-                    "mean": float(cv_results['train_f1_weighted'].mean()),
-                    "std": float(cv_results['train_f1_weighted'].std()),
-                    "scores": cv_results['train_f1_weighted'].tolist()
-                }
-            }
-        }
-        # Save CV results to file
-        cv_results_path = path_manager.get_logs_path("cv_results.json")
-        with open(cv_results_path, 'w') as f:
-            json.dump(cv_data, f, indent=2)
-        log_step(f"Saved CV results to: {cv_results_path}")
-        # Ensure model directory exists
-        model_path.parent.mkdir(parents=True, exist_ok=True)
-        # Save complete pipeline FIRST (this is the priority)
-        log_step(f"Saving pipeline to: {pipeline_path}")
         joblib.dump(pipeline, pipeline_path)
-        # Verify pipeline was saved
-        if pipeline_path.exists():
-            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
-            # Test loading the pipeline
-            try:
-                test_pipeline = joblib.load(pipeline_path)
-                test_pred = test_pipeline.predict(["This is a test"])
-                log_step(f"✅ Pipeline verification successful: {test_pred}")
-            except Exception as e:
-                log_step(f"⚠️ Pipeline verification failed: {e}")
-        else:
-            log_step(f"❌ Pipeline was not saved to {pipeline_path}")
-        # Save individual components for backward compatibility
         try:
             joblib.dump(pipeline.named_steps['model'], model_path)
             joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
@@ -358,45 +365,29 @@ def run_initial_training():
         except Exception as e:
             log_step(f"⚠️ Failed to save individual components: {e}")
-        # Save metadata
         metadata = {
-            "model_version": "v1.0_init",
             "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
-            "train_size": len(X_train),
-            "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
-            "training_method": "initialization",
-            "environment": path_manager.environment,
-            "data_path": str(dataset_path),
-            "class_distribution": class_counts.to_dict(),
-            "pipeline_created": pipeline_path.exists(),
-            "individual_components_created": model_path.exists() and vectorizer_path.exists(),
-            # Add CV results to metadata
-            "cv_f1_mean": float(cv_results['test_f1_weighted'].mean()),
-            "cv_f1_std": float(cv_results['test_f1_weighted'].std()),
-            "cv_accuracy_mean": float(cv_results['test_accuracy'].mean()),
-            "cv_accuracy_std": float(cv_results['test_accuracy'].std())
         }
         metadata_path = path_manager.get_metadata_path()
         with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
-        log_step(f"✅ Training completed successfully")
         log_step(f"   Accuracy: {accuracy:.4f}")
         log_step(f"   F1 Score: {f1:.4f}")
-        log_step(f"   Pipeline saved: {pipeline_path.exists()}")
-        log_step(f"   Model saved to: {model_path}")
-        log_step(f"   Vectorizer saved to: {vectorizer_path}")
         return True
     except Exception as e:
-        log_step(f"❌ Training failed: {str(e)}")
-        import traceback
-        log_step(f"❌ Traceback: {traceback.format_exc()}")
         return False

 import os
 import sys
 import json
+import joblib
 import shutil
 import pandas as pd
 from pathlib import Path
 from datetime import datetime
+from sklearn.pipeline import Pipeline
+from model.train import EnhancedModelTrainer
 from sklearn.model_selection import cross_validate
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.feature_extraction.text import TfidfVectorizer
 # Import the new path manager
 try:
 def run_initial_training():
+    """Run enhanced ensemble model training with LightGBM"""
     log_step("Starting initial model training...")
     try:
         # Check if model already exists
         if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
+            log_step("✅ Model files already exist, skipping training")
+            return True
+        # Import enhanced training components
+        import sys
+        sys.path.append('/app')
+        from model.train import EnhancedModelTrainer
+        log_step("Using Enhanced Model Trainer with ensemble voting...")
+        # Create enhanced trainer with full ensemble configuration
+        trainer = EnhancedModelTrainer(
+            use_enhanced_features=True,  # Enable sentiment, readability, entities, linguistic features
+            enable_ensemble=True         # Enable LightGBM + Random Forest + Logistic Regression ensemble
+        )
+        # Override paths to use the initialization system paths
+        trainer.data_path = path_manager.get_combined_dataset_path()
+        trainer.pipeline_path = pipeline_path
+        trainer.model_path = model_path
+        trainer.vectorizer_path = vectorizer_path
+        trainer.metadata_path = path_manager.get_metadata_path()
+        log_step("Starting enhanced ensemble training (this may take several minutes)...")
+        # Run the full enhanced training
+        success, message = trainer.train_model()
+        if success:
+            log_step(f"✅ Enhanced ensemble training completed: {message}")
+            # Verify pipeline was created
+            if pipeline_path.exists():
+                log_step(f"✅ Enhanced pipeline saved successfully to {pipeline_path}")
+                # Test loading the pipeline
                 try:
                     import joblib
+                    test_pipeline = joblib.load(pipeline_path)
+                    test_pred = test_pipeline.predict(["This is a test article"])
+                    log_step(f"✅ Enhanced pipeline verification successful: {test_pred}")
                 except Exception as e:
+                    log_step(f"⚠️ Enhanced pipeline verification failed: {e}")
+            else:
+                log_step(f"❌ Enhanced pipeline was not saved to {pipeline_path}")
+                return False
             return True
+        else:
+            log_step(f"❌ Enhanced ensemble training failed: {message}")
+            # Fall back to basic training if enhanced training fails
+            log_step("Falling back to basic training...")
+            return run_initial_training()
+    except ImportError as e:
+        log_step(f"⚠️ Enhanced training components not available: {e}")
+        log_step("Falling back to basic training...")
+        return run_basic_training_fallback()
+    except Exception as e:
+        log_step(f"❌ Enhanced training failed: {str(e)}")
+        import traceback
+        log_step(f"❌ Traceback: {traceback.format_exc()}")
+        log_step("Falling back to basic training...")
+        return run_basic_training_fallback()
+def run_basic_training_fallback():
+    """Fallback to basic training if enhanced training fails"""
+    log_step("Running basic training fallback...")
+    try:
+        # Import required libraries for basic training
+        import pandas as pd
+        from sklearn.model_selection import train_test_split, cross_validate
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
         from sklearn.pipeline import Pipeline
+        from sklearn.metrics import accuracy_score, f1_score
         import joblib
+        import json
+        from datetime import datetime
+        # Get paths
+        model_path = path_manager.get_model_file_path()
+        vectorizer_path = path_manager.get_vectorizer_path()
+        pipeline_path = path_manager.get_pipeline_path()
         # Load dataset
         dataset_path = path_manager.get_combined_dataset_path()
             X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
+        # Create basic pipeline
         pipeline = Pipeline([
             ('vectorizer', TfidfVectorizer(
                 max_features=5000,
         ])
         # Train model with cross-validation
+        log_step("Training basic model with cross-validation...")
+        # Perform cross-validation
         cv_results = cross_validate(
             pipeline, X_train, y_train,
             cv=3,
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
+        # Save pipeline
+        log_step(f"Saving basic pipeline to: {pipeline_path}")
         joblib.dump(pipeline, pipeline_path)
+        # Save individual components for compatibility
         try:
             joblib.dump(pipeline.named_steps['model'], model_path)
             joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
         except Exception as e:
             log_step(f"⚠️ Failed to save individual components: {e}")
+        # Save basic metadata
         metadata = {
+            "model_version": "v1.0_basic_fallback",
             "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
             "timestamp": datetime.now().isoformat(),
+            "training_method": "basic_fallback",
+            "environment": path_manager.environment
         }
         metadata_path = path_manager.get_metadata_path()
         with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
+        log_step(f"✅ Basic training completed successfully")
         log_step(f"   Accuracy: {accuracy:.4f}")
         log_step(f"   F1 Score: {f1:.4f}")
         return True
     except Exception as e:
+        log_step(f"❌ Basic training fallback also failed: {str(e)}")
         return False