Ahmedik95316 committed on
Commit 4a1bc0d · 1 Parent(s): c678ee1

Update initialize_system.py

Files changed (1):
  initialize_system.py  +227 -296
initialize_system.py CHANGED
@@ -12,42 +12,61 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


-def check_model_exists():
-    """Check if trained model already exists"""
-    model_files = [
-        Path("/tmp/pipeline.pkl"),
-        Path("/tmp/model.pkl"),
-        Path("/tmp/vectorizer.pkl"),
-        Path("/tmp/metadata.json")
+def check_existing_model():
+    """Check if a complete model setup already exists"""
+    log_step("Checking for existing model setup...")
+
+    critical_files = [
+        "/tmp/model.pkl",
+        "/tmp/vectorizer.pkl",
+        "/tmp/metadata.json"
     ]

-    existing_files = [f for f in model_files if f.exists()]
-
-    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
-        log_step(f"✅ Found {len(existing_files)} existing model files")
-        return True, existing_files
-    else:
-        log_step(f"❌ Missing model files - only found {len(existing_files)}")
-        return False, existing_files
-
-
-def check_training_data_exists():
-    """Check if training data is available"""
-    data_files = [
-        Path("/tmp/data/combined_dataset.csv"),
-        Path("/app/data/combined_dataset.csv"),
-        Path("/tmp/data/kaggle/Fake.csv"),
-        Path("/tmp/data/kaggle/True.csv")
-    ]
-
-    existing_data = [f for f in data_files if f.exists()]
-
-    if existing_data:
-        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
-        return True, existing_data
+    # Check if all critical files exist
+    existing_files = []
+    missing_files = []
+
+    for file_path in critical_files:
+        if Path(file_path).exists():
+            existing_files.append(file_path)
+        else:
+            missing_files.append(file_path)
+
+    # Also check for pipeline (new format)
+    pipeline_path = Path("/tmp/pipeline.pkl")
+    if pipeline_path.exists():
+        existing_files.append(str(pipeline_path))
+
+    if len(existing_files) >= 2:  # At least model + vectorizer OR pipeline + metadata
+        log_step(f"✅ Found existing model setup: {len(existing_files)} files")
+        for file_path in existing_files:
+            file_size = Path(file_path).stat().st_size if Path(file_path).exists() else 0
+            log_step(f" 📁 {file_path} ({file_size:,} bytes)")
+
+        # Check if metadata shows when it was last trained
+        try:
+            metadata_path = Path("/tmp/metadata.json")
+            if metadata_path.exists():
+                with open(metadata_path, 'r') as f:
+                    metadata = json.load(f)
+
+                last_trained = metadata.get('timestamp', 'Unknown')
+                model_version = metadata.get('model_version', 'Unknown')
+                accuracy = metadata.get('test_accuracy', 'Unknown')
+
+                log_step(f" 🎯 Model Version: {model_version}")
+                log_step(f" 📊 Accuracy: {accuracy}")
+                log_step(f" 🕒 Last Trained: {last_trained}")
+
+        except Exception as e:
+            log_step(f" ⚠️ Could not read metadata: {e}")
+
+        return True
     else:
-        log_step("❌ No training data found")
-        return False, []
+        log_step(f"❌ Incomplete model setup found")
+        log_step(f" Existing: {existing_files}")
+        log_step(f" Missing: {missing_files}")
+        return False


 def create_directories():
@@ -56,11 +75,8 @@ def create_directories():

     directories = [
         "/tmp/data",
-        "/tmp/data/kaggle",
         "/tmp/model",
-        "/tmp/logs",
-        "/tmp/results",
-        "/tmp/backups"
+        "/tmp/logs"
     ]

     for dir_path in directories:
@@ -74,11 +90,8 @@ def copy_original_datasets():

     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
-        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
-        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
-        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
-        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
+        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
     ]

     copied_count = 0
@@ -96,84 +109,70 @@ def copy_original_datasets():

 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
-    log_step("Creating minimal dataset...")
+    log_step("Checking for training dataset...")

     combined_path = Path("/tmp/data/combined_dataset.csv")

     if combined_path.exists():
-        log_step("✅ Combined dataset already exists")
+        # Check dataset size
+        df = pd.read_csv(combined_path)
+        log_step(f"✅ Found existing dataset with {len(df)} samples")
         return True

-    # Create minimal training data with more samples for better training
+    log_step("Creating minimal fallback dataset...")
+
+    # Create minimal training data with better examples
     minimal_data = pd.DataFrame({
         'text': [
-            # Real news samples
-            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
-            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
-            'Local authorities report significant improvements in air quality following new environmental regulations',
-            'Research published in Nature journal shows promising results for renewable energy storage technology',
-            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
-            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
-            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
-            'Transportation department announces infrastructure improvements to major highways across the region',
-            'Educational institutions implement new digital learning platforms to enhance student engagement',
-            'Agricultural studies reveal improved crop yields through sustainable farming practices',
-            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
-            'Municipal government approves budget for public transportation expansion project in urban areas',
-            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
-            'International trade agreements show positive impact on local businesses and job creation',
-            'Environmental protection agency releases report on water quality improvements in major rivers',
+            # Real news examples
+            'Scientists at MIT develop new renewable energy technology that could revolutionize solar power generation',
+            'Federal Reserve announces interest rate decision following economic data review by board members',
+            'Local hospital receives grant funding to expand emergency care services for rural communities',
+            'University researchers publish peer-reviewed study on climate change impact in Nature journal',
+            'City council approves new infrastructure project to improve public transportation accessibility',
+            'Technology company reports quarterly earnings beating analyst expectations amid market uncertainty',
+            'International health organization releases guidelines for pandemic preparedness protocols',
+            'Archaeological team discovers ancient artifacts providing insights into historical civilization',
+            'Education department announces new funding for STEM programs in underserved school districts',
+            'Environmental agency implements new regulations to protect endangered species habitats',

-            # Fake news samples
-            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
-            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
-            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
-            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
-            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
-            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
-            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
-            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
-            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
-            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
-            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
-            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
-            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
-            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
-            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
+            # Fake news examples
+            'SHOCKING: Government admits to hiding alien technology in secret underground military bases',
+            'BREAKING: Miracle cure discovered that doctors dont want you to know about eliminates all diseases',
+            'EXCLUSIVE: Celebrity reveals how eating this one weird fruit helped them lose 50 pounds overnight',
+            'URGENT: New world order conspiracy exposed through leaked documents from anonymous whistleblower',
+            'ALERT: Scientists confirm that 5G towers are controlling peoples minds through radio frequencies',
+            'REVEALED: Ancient pyramid discovered in Antarctica proves existence of lost advanced civilization',
+            'WARNING: Vaccination campaign is actually secret government plot to implant tracking microchips',
+            'EXPOSED: Time travel technology has been perfected by shadow government organization since 1960s',
+            'CONFIRMED: Flat earth society presents undeniable proof that NASA has been lying about space',
+            'INCREDIBLE: Man discovers how to predict lottery numbers using this simple mathematical formula'
         ],
-        'label': [
-            # Real news labels (0)
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            # Fake news labels (1)
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-        ]
+        'label': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # Real news (first 10)
+                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # Fake news (last 10)
     })

     minimal_data.to_csv(combined_path, index=False)
-    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
-    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
-    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
+    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
     return True


-def run_initial_training():
-    """Run comprehensive model training for first-time setup"""
-    log_step("🚀 Starting comprehensive model training for first-time setup...")
+def run_comprehensive_training():
+    """Run comprehensive model training with pipeline"""
+    log_step("🚀 Starting comprehensive model training...")

     try:
-        # Import training modules
+        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.ensemble import RandomForestClassifier
-        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
         from sklearn.pipeline import Pipeline
-        from sklearn.feature_selection import SelectKBest, chi2
+        from sklearn.model_selection import train_test_split
+        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
        from sklearn.preprocessing import FunctionTransformer
-        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
         import re

-        # Text preprocessing function (same as in train.py)
+        # Text preprocessing function
         def preprocess_text_function(texts):
             def clean_single_text(text):
                 text = str(text)
@@ -200,135 +199,112 @@ def run_initial_training():
         df = pd.read_csv(dataset_path)
         log_step(f"📊 Loaded dataset with {len(df)} samples")

-        # Data validation and cleaning
-        df = df.dropna(subset=['text', 'label'])
-        df = df[df['text'].astype(str).str.len() > 10]
-
-        log_step(f"📊 After cleaning: {len(df)} samples")
-        log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")
-
         # Prepare data
         X = df['text'].values
         y = df['label'].values

+        # Check class distribution
+        unique, counts = np.unique(y, return_counts=True)
+        log_step(f"📈 Class distribution: {dict(zip(unique, counts))}")
+
         # Train-test split
+        test_size = 0.2 if len(df) > 20 else 0.1
         X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y
+            X, y, test_size=test_size, random_state=42,
+            stratify=y if len(np.unique(y)) > 1 else None
         )

         log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

-        # Create comprehensive pipeline
+        # Create preprocessing pipeline
         text_preprocessor = FunctionTransformer(
             func=preprocess_text_function,
             validate=False
         )

-        vectorizer = TfidfVectorizer(
-            max_features=5000,
-            min_df=1,
-            max_df=0.95,
-            ngram_range=(1, 2),
-            stop_words='english',
-            sublinear_tf=True,
-            norm='l2'
-        )
-
-        feature_selector = SelectKBest(
-            score_func=chi2,
-            k=2000
-        )
-
-        # Create pipeline with Logistic Regression
+        # Create comprehensive pipeline
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
-            ('vectorize', vectorizer),
-            ('feature_select', feature_selector),
-            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
+            ('vectorize', TfidfVectorizer(
+                max_features=5000,
+                min_df=1,
+                max_df=0.95,
+                ngram_range=(1, 2),
+                stop_words='english',
+                sublinear_tf=True,
+                norm='l2'
+            )),
+            ('model', LogisticRegression(
+                max_iter=1000,
+                random_state=42,
+                class_weight='balanced'
+            ))
         ])

-        log_step("🔧 Training model with optimized pipeline...")
-
-        # Hyperparameter tuning for datasets with sufficient samples
-        if len(X_train) >= 20:
-            log_step("⚙️ Performing hyperparameter tuning...")
-            param_grid = {
-                'model__C': [0.1, 1, 10],
-                'model__penalty': ['l2']
-            }
-
-            cv_folds = max(2, min(3, len(X_train) // 10))
-            grid_search = GridSearchCV(
-                pipeline,
-                param_grid,
-                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
-                scoring='f1_weighted',
-                n_jobs=1
-            )
-
-            grid_search.fit(X_train, y_train)
-            best_pipeline = grid_search.best_estimator_
-
-            log_step(f"✅ Best parameters: {grid_search.best_params_}")
-            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
-        else:
-            log_step("⚙️ Using simple training for small dataset...")
-            pipeline.fit(X_train, y_train)
-            best_pipeline = pipeline
+        log_step("🔧 Training pipeline...")
+        pipeline.fit(X_train, y_train)

-        # Evaluate model
-        y_pred = best_pipeline.predict(X_test)
+        # Evaluate
+        y_pred = pipeline.predict(X_test)
+
+        # Calculate comprehensive metrics
         accuracy = accuracy_score(y_test, y_pred)
+        precision = precision_score(y_test, y_pred, average='weighted')
+        recall = recall_score(y_test, y_pred, average='weighted')
         f1 = f1_score(y_test, y_pred, average='weighted')

-        log_step(f"📈 Model Performance:")
-        log_step(f" - Accuracy: {accuracy:.4f}")
-        log_step(f" - F1 Score: {f1:.4f}")
+        log_step(f"📊 Model Performance:")
+        log_step(f" Accuracy: {accuracy:.4f}")
+        log_step(f" Precision: {precision:.4f}")
+        log_step(f" Recall: {recall:.4f}")
+        log_step(f" F1 Score: {f1:.4f}")

-        # Save model artifacts
+        # Save comprehensive model setup
         log_step("💾 Saving model artifacts...")
-
-        # Save the complete pipeline
-        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
+
+        # Save complete pipeline
+        joblib.dump(pipeline, "/tmp/pipeline.pkl")
         log_step("✅ Saved complete pipeline")

-        # Save individual components for compatibility
-        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-        log_step("✅ Saved individual model components")
+        # Save individual components for backward compatibility
+        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual components")

         # Generate comprehensive metadata
         metadata = {
             "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-            "model_type": "logistic_regression",
-            "training_method": "initial_setup",
-            "dataset_size": len(df),
-            "train_size": len(X_train),
-            "test_size": len(X_test),
+            "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
+            "test_precision": float(precision),
+            "test_recall": float(recall),
             "test_f1": float(f1),
-            "hyperparameter_tuning": len(X_train) >= 20,
-            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
-            "class_distribution": df['label'].value_counts().to_dict(),
-            "training_config": {
+            "train_size": len(X_train),
+            "test_size": len(X_test),
+            "dataset_size": len(df),
+            "timestamp": datetime.now().isoformat(),
+            "training_method": "comprehensive_initialization",
+            "pipeline_components": ["preprocess", "vectorize", "model"],
+            "vectorizer_config": {
                 "max_features": 5000,
                 "ngram_range": [1, 2],
-                "feature_selection_k": 2000,
-                "test_size": 0.2
+                "stop_words": "english"
             },
-            "timestamp": datetime.now().isoformat(),
-            "initialization_notes": "Model trained during system initialization",
-            "ready_for_production": True
+            "model_config": {
+                "algorithm": "LogisticRegression",
+                "max_iter": 1000,
+                "class_weight": "balanced"
+            }
         }

-        # Save metadata
         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)

         log_step("✅ Saved comprehensive metadata")
-        log_step(f"🎉 Initial model training completed successfully!")
-        log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
-
+        log_step(f"🎉 Training completed successfully!")
+        log_step(f" Final accuracy: {accuracy:.4f}")
+        log_step(f" Model ready for production use")
+
         return True

     except Exception as e:
@@ -346,7 +322,7 @@ def create_initial_logs():
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully with trained model",
+            "event": "System initialized successfully",
             "level": "INFO"
         }]

@@ -354,16 +330,9 @@ def create_initial_logs():
             json.dump(activity_log, f, indent=2)

         # Create empty monitoring logs
-        log_dirs = ["/tmp/logs"]
-        for log_dir in log_dirs:
-            Path(log_dir).mkdir(parents=True, exist_ok=True)
-
         with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)

-        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
-            json.dump([], f)
-
         log_step("✅ Initial log files created")
         return True

@@ -372,136 +341,98 @@ def create_initial_logs():
         return False


-def validate_installation():
-    """Validate that the system is properly set up"""
-    log_step("🔍 Validating system installation...")
-
-    validation_checks = []
-
-    # Check model files
-    model_exists, model_files = check_model_exists()
-    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
-
-    # Check data files
-    data_exists, data_files = check_training_data_exists()
-    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
-
-    # Check directories
-    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
-    dirs_exist = all(Path(d).exists() for d in required_dirs)
-    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
-
-    # Check logs
-    log_exists = Path("/tmp/activity_log.json").exists()
-    validation_checks.append(("Log Files", log_exists, "Activity log created"))
-
-    # Test model loading
-    model_loadable = False
-    try:
-        import joblib
-        pipeline = joblib.load("/tmp/pipeline.pkl")
-        test_prediction = pipeline.predict(["This is a test news article"])
-        model_loadable = True
-        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
-    except Exception as e:
-        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
-
-    # Print validation results
-    log_step("📋 Validation Results:")
-    all_passed = True
-    for check_name, passed, details in validation_checks:
-        status = "✅ PASS" if passed else "❌ FAIL"
-        log_step(f" {status} {check_name}: {details}")
-        if not passed:
-            all_passed = False
-
-    return all_passed, validation_checks
-
-
 def main():
-    """Main initialization function with smart training logic"""
-    log_step("🚀 Starting intelligent system initialization...")
-
-    # Check if model already exists
-    model_exists, existing_model_files = check_model_exists()
+    """Main initialization function with smart model training"""
+    log_step("🚀 Starting smart system initialization...")

-    if model_exists:
-        log_step("🎯 EXISTING INSTALLATION DETECTED")
-        log_step("📄 Found existing model files - skipping training")
-
-        # Load existing metadata to show info
-        try:
-            with open("/tmp/metadata.json", 'r') as f:
-                metadata = json.load(f)
-
-            log_step(f"📊 Existing Model Info:")
-            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
-            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
-            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
-            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
-
-        except Exception as e:
-            log_step(f"⚠️ Could not read existing metadata: {e}")
+    # First, check if we already have a working model
+    has_existing_model = check_existing_model()

+    # Define steps based on whether model exists
+    if has_existing_model:
+        log_step("🎯 Existing model detected - skipping training")
+        steps = [
+            ("Directory Creation", create_directories),
+            ("Dataset Copy", copy_original_datasets),
+            ("Dataset Validation", create_minimal_dataset),
+            ("Log Creation", create_initial_logs)
+        ]
     else:
-        log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
-        log_step("🔧 No existing model found - will train new model")
-
-    # Run initialization steps
-    steps = [
-        ("Directory Creation", create_directories),
-        ("Dataset Copy", copy_original_datasets),
-        ("Dataset Preparation", create_minimal_dataset),
-        ("Log Creation", create_initial_logs)
-    ]
-
-    # Add training step only if model doesn't exist
-    if not model_exists:
-        steps.insert(-1, ("🤖 Model Training", run_initial_training))
+        log_step("🆕 No existing model - will perform first-time setup with training")
+        steps = [
+            ("Directory Creation", create_directories),
+            ("Dataset Copy", copy_original_datasets),
+            ("Dataset Preparation", create_minimal_dataset),
+            ("Model Training", run_comprehensive_training),
+            ("Log Creation", create_initial_logs)
+        ]

     failed_steps = []
+    total_steps = len(steps)

-    for step_name, step_function in steps:
+    for i, (step_name, step_function) in enumerate(steps, 1):
+        log_step(f"📋 Step {i}/{total_steps}: {step_name}")
+
         try:
-            log_step(f"▶️ Starting: {step_name}")
             if step_function():
-                log_step(f"✅ {step_name} completed")
+                log_step(f"✅ {step_name} completed successfully")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
-            log_step(f"❌ {step_name} failed: {str(e)}")
+            log_step(f"❌ {step_name} failed with exception: {str(e)}")
             failed_steps.append(step_name)

-    # Final validation
-    log_step("🔍 Running final system validation...")
-    validation_passed, validation_results = validate_installation()
-
-    # Summary
-    log_step("=" * 60)
+    # Final summary
+    log_step("=" * 50)
     if failed_steps:
-        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
-        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
-    else:
-        log_step("🎉 System initialization completed successfully!")
-
-    if validation_passed:
-        log_step("✅ All validation checks passed!")
-        log_step("🚀 System is ready for use!")
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
+        log_step(f"Failed steps: {', '.join(failed_steps)}")

-        if not model_exists:
-            log_step("🤖 NEW MODEL TRAINED AND READY")
-            log_step("📊 You can now start making predictions!")
+        # Check if critical components are still available
+        if check_existing_model():
+            log_step("✅ Critical model components are available despite some failures")
         else:
-            log_step("🔄 EXISTING MODEL VALIDATED AND READY")
-            log_step("📊 System restored from previous installation!")
+            log_step("❌ Critical model components are missing - system may not work properly")

     else:
-        log_step("❌ Some validation checks failed")
-        log_step("🔧 Manual intervention may be required")
-
-    log_step("=" * 60)
+        if has_existing_model:
+            log_step("🎉 System initialization completed successfully!")
+            log_step("🚀 Existing model loaded - system ready for immediate use!")
+        else:
+            log_step("🎉 First-time setup completed successfully!")
+            log_step("🚀 Model trained and system ready for use!")
+
+    # Final status check
+    log_step("📊 Final System Status:")
+    critical_files = [
+        ("/tmp/pipeline.pkl", "Complete Pipeline"),
+        ("/tmp/model.pkl", "Model Component"),
+        ("/tmp/vectorizer.pkl", "Vectorizer Component"),
+        ("/tmp/metadata.json", "Model Metadata"),
+        ("/tmp/data/combined_dataset.csv", "Training Dataset")
+    ]
+
+    ready_count = 0
+    for file_path, description in critical_files:
+        if Path(file_path).exists():
+            file_size = Path(file_path).stat().st_size
+            log_step(f" ✅ {description}: {file_size:,} bytes")
+            ready_count += 1
+        else:
+            log_step(f" ❌ {description}: Missing")
+
+    log_step(f"📈 System Readiness: {ready_count}/{len(critical_files)} components available")
+
+    if ready_count >= 3:  # At least model + vectorizer + metadata OR pipeline + metadata
+        log_step("🎯 System is ready for production use!")
+    else:
+        log_step("⚠️ System setup incomplete - may require manual intervention")
+
+    log_step("=" * 50)


 if __name__ == "__main__":
+    # Add numpy import for the training function
+    import numpy as np
     main()
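
For reference, a minimal sketch of how the artifacts written by this script might be smoke-tested after initialization. The /tmp paths and metadata keys come from the code above; the sample headline is only illustrative, and the sketch assumes the saved pipeline can be unpickled in the target environment:

import json
import joblib

# Load the full pipeline saved by run_comprehensive_training()
pipeline = joblib.load("/tmp/pipeline.pkl")

# Read the metadata written alongside it
with open("/tmp/metadata.json") as f:
    metadata = json.load(f)
print(f"Model version: {metadata['model_version']}, test F1: {metadata['test_f1']:.4f}")

# Score one sample headline; the training data above uses 0 = real, 1 = fake
prediction = pipeline.predict(["University researchers publish peer-reviewed study on renewable energy"])
print(f"Predicted label: {prediction[0]}")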