Update initialize_system.py

initialize_system.py (+122 -28)
@@ -193,25 +193,51 @@ def create_minimal_dataset():
 
 def run_initial_training():
     """Run basic model training"""
-    log_step("
-
-    try:
-        # Import required libraries for basic training
-        import pandas as pd
-        from sklearn.model_selection import train_test_split, cross_validate
-        from sklearn.feature_extraction.text import TfidfVectorizer
-        from sklearn.linear_model import LogisticRegression
-        from sklearn.pipeline import Pipeline
-        from sklearn.metrics import accuracy_score, f1_score
-        import joblib
-        import json
-        from datetime import datetime
+    log_step("Starting initial model training...")
 
-
+    try:
+        # Get all the paths
         model_path = path_manager.get_model_file_path()
         vectorizer_path = path_manager.get_vectorizer_path()
         pipeline_path = path_manager.get_pipeline_path()
 
+        log_step(f"Model path: {model_path}")
+        log_step(f"Vectorizer path: {vectorizer_path}")
+        log_step(f"Pipeline path: {pipeline_path}")
+
+        # Check if model already exists
+        if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
+            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
+
+            # If individual components exist but pipeline doesn't, create pipeline
+            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
+                log_step("Creating pipeline from existing components...")
+                try:
+                    import joblib
+                    from sklearn.pipeline import Pipeline
+
+                    # Load existing components
+                    model = joblib.load(model_path)
+                    vectorizer = joblib.load(vectorizer_path)
+
+                    # Create pipeline
+                    pipeline = Pipeline([
+                        ('vectorizer', vectorizer),
+                        ('model', model)
+                    ])
+
+                    # Save pipeline
+                    joblib.dump(pipeline, pipeline_path)
+                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
+
+                except Exception as e:
+                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
+
+            return True
+
+        # Import required libraries
+
+
         # Load dataset
         dataset_path = path_manager.get_combined_dataset_path()
         if not dataset_path.exists():
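
Note on the hunk above: rebuilding the pipeline from saved components just chains the same two objects, so its predictions should match applying the steps by hand. A minimal sketch of that equivalence, with hypothetical filenames standing in for whatever path_manager resolves:

import joblib
from sklearn.pipeline import Pipeline

# Hypothetical filenames; the script resolves the real paths via path_manager.
model = joblib.load("model.joblib")
vectorizer = joblib.load("vectorizer.joblib")

pipeline = Pipeline([('vectorizer', vectorizer), ('model', model)])

texts = ["free prize, click now", "meeting moved to 3pm"]
# The composed pipeline and the hand-chained steps agree prediction-for-prediction.
assert (pipeline.predict(texts) == model.predict(vectorizer.transform(texts))).all()
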
@@ -239,7 +265,7 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
 
-        # Create
+        # Create pipeline with preprocessing
         pipeline = Pipeline([
             ('vectorizer', TfidfVectorizer(
                 max_features=5000,
@@ -256,9 +282,9 @@ def run_initial_training():
         ])
 
         # Train model with cross-validation
-        log_step("Training
+        log_step("Training model with cross-validation...")
 
-        # Perform cross-validation
+        # Perform cross-validation before final training
         cv_results = cross_validate(
             pipeline, X_train, y_train,
             cv=3,
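
The scoring setup for cross_validate sits outside this hunk, but the result keys read later (test_accuracy, test_f1_weighted, train_accuracy, train_f1_weighted) imply a call shaped like the self-contained sketch below; the toy data and the exact argument values are assumptions, not part of the commit:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

# Toy stand-ins for the real training split.
X_train = ["win a prize now", "lunch at noon?", "free offer inside",
           "see you tomorrow", "claim your reward", "notes from the call"]
y_train = [1, 0, 1, 0, 1, 0]

pipeline = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('model', LogisticRegression())])

cv_results = cross_validate(
    pipeline, X_train, y_train,
    cv=3,
    scoring=('accuracy', 'f1_weighted'),  # produces test_accuracy / test_f1_weighted
    return_train_score=True,              # produces train_accuracy / train_f1_weighted
)
print(sorted(cv_results))  # fit_time, score_time, test_*, train_* arrays
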
@@ -274,11 +300,63 @@ def run_initial_training():
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
-        # Save
-
-
+        # Save CV results for API access
+        cv_data = {
+            "n_splits": 3,
+            "test_scores": {
+                "accuracy": {
+                    "mean": float(cv_results['test_accuracy'].mean()),
+                    "std": float(cv_results['test_accuracy'].std()),
+                    "scores": cv_results['test_accuracy'].tolist()
+                },
+                "f1": {
+                    "mean": float(cv_results['test_f1_weighted'].mean()),
+                    "std": float(cv_results['test_f1_weighted'].std()),
+                    "scores": cv_results['test_f1_weighted'].tolist()
+                }
+            },
+            "train_scores": {
+                "accuracy": {
+                    "mean": float(cv_results['train_accuracy'].mean()),
+                    "std": float(cv_results['train_accuracy'].std()),
+                    "scores": cv_results['train_accuracy'].tolist()
+                },
+                "f1": {
+                    "mean": float(cv_results['train_f1_weighted'].mean()),
+                    "std": float(cv_results['train_f1_weighted'].std()),
+                    "scores": cv_results['train_f1_weighted'].tolist()
+                }
+            }
+        }
 
-        # Save
+        # Save CV results to file
+        cv_results_path = path_manager.get_logs_path("cv_results.json")
+        with open(cv_results_path, 'w') as f:
+            json.dump(cv_data, f, indent=2)
+        log_step(f"Saved CV results to: {cv_results_path}")
+
+        # Ensure model directory exists
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save complete pipeline FIRST (this is the priority)
+        log_step(f"Saving pipeline to: {pipeline_path}")
+        joblib.dump(pipeline, pipeline_path)
+
+        # Verify pipeline was saved
+        if pipeline_path.exists():
+            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
+
+            # Test loading the pipeline
+            try:
+                test_pipeline = joblib.load(pipeline_path)
+                test_pred = test_pipeline.predict(["This is a test"])
+                log_step(f"✅ Pipeline verification successful: {test_pred}")
+            except Exception as e:
+                log_step(f"⚠️ Pipeline verification failed: {e}")
+        else:
+            log_step(f"❌ Pipeline was not saved to {pipeline_path}")
+
+        # Save individual components for backward compatibility
         try:
             joblib.dump(pipeline.named_steps['model'], model_path)
             joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
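
Nothing in this commit shows the consumer behind the "for API access" comment, but given the schema written above, a reader can be as small as the sketch below (the logs/ directory is a stand-in for path_manager.get_logs_path(...)):

import json
from pathlib import Path

def load_cv_summary(logs_dir="logs"):
    """Summarize the cv_results.json schema written by run_initial_training."""
    data = json.loads((Path(logs_dir) / "cv_results.json").read_text())
    test = data["test_scores"]
    return {
        "n_splits": data["n_splits"],
        "accuracy": f"{test['accuracy']['mean']:.3f} ± {test['accuracy']['std']:.3f}",
        "f1": f"{test['f1']['mean']:.3f} ± {test['f1']['std']:.3f}",
    }
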
@@ -286,29 +364,45 @@ def run_initial_training():
         except Exception as e:
             log_step(f"⚠️ Failed to save individual components: {e}")
 
-        # Save
+        # Save metadata
         metadata = {
-            "model_version": "v1.
+            "model_version": "v1.0_init",
             "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
+            "train_size": len(X_train),
+            "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
-            "training_method": "
-            "environment": path_manager.environment
+            "training_method": "initialization",
+            "environment": path_manager.environment,
+            "data_path": str(dataset_path),
+            "class_distribution": class_counts.to_dict(),
+            "pipeline_created": pipeline_path.exists(),
+            "individual_components_created": model_path.exists() and vectorizer_path.exists(),
+            # Add CV results to metadata
+            "cv_f1_mean": float(cv_results['test_f1_weighted'].mean()),
+            "cv_f1_std": float(cv_results['test_f1_weighted'].std()),
+            "cv_accuracy_mean": float(cv_results['test_accuracy'].mean()),
+            "cv_accuracy_std": float(cv_results['test_accuracy'].std())
         }
 
         metadata_path = path_manager.get_metadata_path()
         with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step(f"✅
+        log_step(f"✅ Training completed successfully")
         log_step(f"   Accuracy: {accuracy:.4f}")
         log_step(f"   F1 Score: {f1:.4f}")
+        log_step(f"   Pipeline saved: {pipeline_path.exists()}")
+        log_step(f"   Model saved to: {model_path}")
+        log_step(f"   Vectorizer saved to: {vectorizer_path}")
 
         return True
-
+
     except Exception as e:
-        log_step(f"❌
+        log_step(f"❌ Training failed: {str(e)}")
+        import traceback
+        log_step(f"❌ Traceback: {traceback.format_exc()}")
         return False
 
 
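
Downstream, the point of persisting the whole Pipeline is that consumers load a single artifact instead of recombining model and vectorizer. A usage sketch mirroring the verification step above; the filename is hypothetical, the real path comes from path_manager.get_pipeline_path():

import joblib

pipeline = joblib.load("pipeline.joblib")
print(pipeline.predict(["This is a test"]))        # same smoke test the script runs
print(pipeline.predict_proba(["This is a test"]))  # LogisticRegression exposes class probabilities
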