Ahmedik95316 committed on
Commit 5bb1d1a · 1 Parent(s): 4a1bc0d

Update initialize_system.py

Files changed (1)
  1. initialize_system.py +296 -227
initialize_system.py CHANGED
@@ -12,61 +12,42 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


-def check_existing_model():
-    """Check if a complete model setup already exists"""
-    log_step("Checking for existing model setup...")
-
-    critical_files = [
-        "/tmp/model.pkl",
-        "/tmp/vectorizer.pkl",
-        "/tmp/metadata.json"
     ]

-    # Check if all critical files exist
-    existing_files = []
-    missing_files = []

-    for file_path in critical_files:
-        if Path(file_path).exists():
-            existing_files.append(file_path)
-        else:
-            missing_files.append(file_path)

-    # Also check for pipeline (new format)
-    pipeline_path = Path("/tmp/pipeline.pkl")
-    if pipeline_path.exists():
-        existing_files.append(str(pipeline_path))

-    if len(existing_files) >= 2: # At least model + vectorizer OR pipeline + metadata
-        log_step(f"✅ Found existing model setup: {len(existing_files)} files")
-        for file_path in existing_files:
-            file_size = Path(file_path).stat().st_size if Path(file_path).exists() else 0
-            log_step(f" 📁 {file_path} ({file_size:,} bytes)")
-
-        # Check if metadata shows when it was last trained
-        try:
-            metadata_path = Path("/tmp/metadata.json")
-            if metadata_path.exists():
-                with open(metadata_path, 'r') as f:
-                    metadata = json.load(f)
-
-                last_trained = metadata.get('timestamp', 'Unknown')
-                model_version = metadata.get('model_version', 'Unknown')
-                accuracy = metadata.get('test_accuracy', 'Unknown')
-
-                log_step(f" 🎯 Model Version: {model_version}")
-                log_step(f" 📊 Accuracy: {accuracy}")
-                log_step(f" 🕒 Last Trained: {last_trained}")
-
-        except Exception as e:
-            log_step(f" ⚠️ Could not read metadata: {e}")
-
-        return True
     else:
-        log_step(f"❌ Incomplete model setup found")
-        log_step(f" Existing: {existing_files}")
-        log_step(f" Missing: {missing_files}")
-        return False


 def create_directories():
@@ -75,8 +56,11 @@ def create_directories():

     directories = [
         "/tmp/data",
         "/tmp/model",
-        "/tmp/logs"
     ]

     for dir_path in directories:
@@ -90,8 +74,11 @@ def copy_original_datasets():

     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
-        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
     ]

     copied_count = 0
@@ -109,70 +96,84 @@ def copy_original_datasets():

 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
-    log_step("Checking for training dataset...")

     combined_path = Path("/tmp/data/combined_dataset.csv")

     if combined_path.exists():
-        # Check dataset size
-        df = pd.read_csv(combined_path)
-        log_step(f"✅ Found existing dataset with {len(df)} samples")
         return True

-    log_step("Creating minimal fallback dataset...")
-
-    # Create minimal training data with better examples
     minimal_data = pd.DataFrame({
         'text': [
-            # Real news examples
-            'Scientists at MIT develop new renewable energy technology that could revolutionize solar power generation',
-            'Federal Reserve announces interest rate decision following economic data review by board members',
-            'Local hospital receives grant funding to expand emergency care services for rural communities',
-            'University researchers publish peer-reviewed study on climate change impact in Nature journal',
-            'City council approves new infrastructure project to improve public transportation accessibility',
-            'Technology company reports quarterly earnings beating analyst expectations amid market uncertainty',
-            'International health organization releases guidelines for pandemic preparedness protocols',
-            'Archaeological team discovers ancient artifacts providing insights into historical civilization',
-            'Education department announces new funding for STEM programs in underserved school districts',
-            'Environmental agency implements new regulations to protect endangered species habitats',

-            # Fake news examples
-            'SHOCKING: Government admits to hiding alien technology in secret underground military bases',
-            'BREAKING: Miracle cure discovered that doctors dont want you to know about eliminates all diseases',
-            'EXCLUSIVE: Celebrity reveals how eating this one weird fruit helped them lose 50 pounds overnight',
-            'URGENT: New world order conspiracy exposed through leaked documents from anonymous whistleblower',
-            'ALERT: Scientists confirm that 5G towers are controlling peoples minds through radio frequencies',
-            'REVEALED: Ancient pyramid discovered in Antarctica proves existence of lost advanced civilization',
-            'WARNING: Vaccination campaign is actually secret government plot to implant tracking microchips',
-            'EXPOSED: Time travel technology has been perfected by shadow government organization since 1960s',
-            'CONFIRMED: Flat earth society presents undeniable proof that NASA has been lying about space',
-            'INCREDIBLE: Man discovers how to predict lottery numbers using this simple mathematical formula'
         ],
-        'label': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # Real news (first 10)
-                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # Fake news (last 10)
     })

     minimal_data.to_csv(combined_path, index=False)
-    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
     return True


-def run_comprehensive_training():
-    """Run comprehensive model training with pipeline"""
-    log_step("🚀 Starting comprehensive model training...")

     try:
-        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
         from sklearn.pipeline import Pipeline
-        from sklearn.model_selection import train_test_split
-        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
         from sklearn.preprocessing import FunctionTransformer
         import joblib
         import re

-        # Text preprocessing function
         def preprocess_text_function(texts):
             def clean_single_text(text):
                 text = str(text)
@@ -199,112 +200,135 @@ def run_comprehensive_training():
         df = pd.read_csv(dataset_path)
         log_step(f"📊 Loaded dataset with {len(df)} samples")

         # Prepare data
         X = df['text'].values
         y = df['label'].values

-        # Check class distribution
-        unique, counts = np.unique(y, return_counts=True)
-        log_step(f"📈 Class distribution: {dict(zip(unique, counts))}")
-
         # Train-test split
-        test_size = 0.2 if len(df) > 20 else 0.1
         X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=test_size, random_state=42,
-            stratify=y if len(np.unique(y)) > 1 else None
         )

         log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

-        # Create preprocessing pipeline
         text_preprocessor = FunctionTransformer(
             func=preprocess_text_function,
             validate=False
         )

-        # Create comprehensive pipeline
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
-            ('vectorize', TfidfVectorizer(
-                max_features=5000,
-                min_df=1,
-                max_df=0.95,
-                ngram_range=(1, 2),
-                stop_words='english',
-                sublinear_tf=True,
-                norm='l2'
-            )),
-            ('model', LogisticRegression(
-                max_iter=1000,
-                random_state=42,
-                class_weight='balanced'
-            ))
         ])

-        log_step("🔧 Training pipeline...")
-        pipeline.fit(X_train, y_train)

-        # Evaluate
-        y_pred = pipeline.predict(X_test)
-
-        # Calculate comprehensive metrics
         accuracy = accuracy_score(y_test, y_pred)
-        precision = precision_score(y_test, y_pred, average='weighted')
-        recall = recall_score(y_test, y_pred, average='weighted')
         f1 = f1_score(y_test, y_pred, average='weighted')

-        log_step(f"📊 Model Performance:")
-        log_step(f" Accuracy: {accuracy:.4f}")
-        log_step(f" Precision: {precision:.4f}")
-        log_step(f" Recall: {recall:.4f}")
-        log_step(f" F1 Score: {f1:.4f}")

-        # Save comprehensive model setup
         log_step("💾 Saving model artifacts...")
-
-        # Save complete pipeline
-        joblib.dump(pipeline, "/tmp/pipeline.pkl")
         log_step("✅ Saved complete pipeline")

-        # Save individual components for backward compatibility
-        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-        log_step("✅ Saved individual components")

         # Generate comprehensive metadata
         metadata = {
             "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-            "model_type": "logistic_regression_pipeline",
-            "test_accuracy": float(accuracy),
-            "test_precision": float(precision),
-            "test_recall": float(recall),
-            "test_f1": float(f1),
             "train_size": len(X_train),
             "test_size": len(X_test),
-            "dataset_size": len(df),
-            "timestamp": datetime.now().isoformat(),
-            "training_method": "comprehensive_initialization",
-            "pipeline_components": ["preprocess", "vectorize", "model"],
-            "vectorizer_config": {
                 "max_features": 5000,
                 "ngram_range": [1, 2],
-                "stop_words": "english"
             },
-            "model_config": {
-                "algorithm": "LogisticRegression",
-                "max_iter": 1000,
-                "class_weight": "balanced"
-            }
         }

         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)

         log_step("✅ Saved comprehensive metadata")
-        log_step(f"🎉 Training completed successfully!")
-        log_step(f" Final accuracy: {accuracy:.4f}")
-        log_step(f" Model ready for production use")
-
         return True

     except Exception as e:
@@ -322,7 +346,7 @@ def create_initial_logs():
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully",
             "level": "INFO"
         }]

@@ -330,9 +354,16 @@ def create_initial_logs():
             json.dump(activity_log, f, indent=2)

         # Create empty monitoring logs
         with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)

         log_step("✅ Initial log files created")
         return True

@@ -341,98 +372,136 @@ def create_initial_logs():
         return False


 def main():
-    """Main initialization function with smart model training"""
-    log_step("🚀 Starting smart system initialization...")

-    # First, check if we already have a working model
-    has_existing_model = check_existing_model()

-    # Define steps based on whether model exists
-    if has_existing_model:
-        log_step("🎯 Existing model detected - skipping training")
-        steps = [
-            ("Directory Creation", create_directories),
-            ("Dataset Copy", copy_original_datasets),
-            ("Dataset Validation", create_minimal_dataset),
-            ("Log Creation", create_initial_logs)
-        ]
     else:
-        log_step("🆕 No existing model - will perform first-time setup with training")
-        steps = [
-            ("Directory Creation", create_directories),
-            ("Dataset Copy", copy_original_datasets),
-            ("Dataset Preparation", create_minimal_dataset),
-            ("Model Training", run_comprehensive_training),
-            ("Log Creation", create_initial_logs)
-        ]

     failed_steps = []
-    total_steps = len(steps)

-    for i, (step_name, step_function) in enumerate(steps, 1):
-        log_step(f"📋 Step {i}/{total_steps}: {step_name}")
-
         try:
             if step_function():
-                log_step(f"✅ {step_name} completed successfully")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
-            log_step(f"❌ {step_name} failed with exception: {str(e)}")
             failed_steps.append(step_name)

-    # Final summary
-    log_step("=" * 50)
     if failed_steps:
-        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
-        log_step(f"Failed steps: {', '.join(failed_steps)}")

-        # Check if critical components are still available
-        if check_existing_model():
-            log_step("✅ Critical model components are available despite some failures")
         else:
-            log_step("❌ Critical model components are missing - system may not work properly")

     else:
-        if has_existing_model:
-            log_step("🎉 System initialization completed successfully!")
-            log_step("🚀 Existing model loaded - system ready for immediate use!")
-        else:
-            log_step("🎉 First-time setup completed successfully!")
-            log_step("🚀 Model trained and system ready for use!")
-
-    # Final status check
-    log_step("📊 Final System Status:")
-    critical_files = [
-        ("/tmp/pipeline.pkl", "Complete Pipeline"),
-        ("/tmp/model.pkl", "Model Component"),
-        ("/tmp/vectorizer.pkl", "Vectorizer Component"),
-        ("/tmp/metadata.json", "Model Metadata"),
-        ("/tmp/data/combined_dataset.csv", "Training Dataset")
-    ]
-
-    ready_count = 0
-    for file_path, description in critical_files:
-        if Path(file_path).exists():
-            file_size = Path(file_path).stat().st_size
-            log_step(f" ✅ {description}: {file_size:,} bytes")
-            ready_count += 1
-        else:
-            log_step(f" ❌ {description}: Missing")
-
-    log_step(f"📈 System Readiness: {ready_count}/{len(critical_files)} components available")
-
-    if ready_count >= 3: # At least model + vectorizer + metadata OR pipeline + metadata
-        log_step("🎯 System is ready for production use!")
-    else:
-        log_step("⚠️ System setup incomplete - may require manual intervention")
-
-    log_step("=" * 50)


 if __name__ == "__main__":
-    # Add numpy import for the training function
-    import numpy as np
     main()

     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


+def check_model_exists():
+    """Check if trained model already exists"""
+    model_files = [
+        Path("/tmp/pipeline.pkl"),
+        Path("/tmp/model.pkl"),
+        Path("/tmp/vectorizer.pkl"),
+        Path("/tmp/metadata.json")
     ]

+    existing_files = [f for f in model_files if f.exists()]

+    if len(existing_files) >= 2: # At least pipeline + metadata OR model + vectorizer
+        log_step(f"✅ Found {len(existing_files)} existing model files")
+        return True, existing_files
+    else:
+        log_step(f"❌ Missing model files - only found {len(existing_files)}")
+        return False, existing_files
+
+
+def check_training_data_exists():
+    """Check if training data is available"""
+    data_files = [
+        Path("/tmp/data/combined_dataset.csv"),
+        Path("/app/data/combined_dataset.csv"),
+        Path("/tmp/data/kaggle/Fake.csv"),
+        Path("/tmp/data/kaggle/True.csv")
+    ]

+    existing_data = [f for f in data_files if f.exists()]

+    if existing_data:
+        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
+        return True, existing_data
     else:
+        log_step("❌ No training data found")
+        return False, []


 def create_directories():

     directories = [
         "/tmp/data",
+        "/tmp/data/kaggle",
         "/tmp/model",
+        "/tmp/logs",
+        "/tmp/results",
+        "/tmp/backups"
     ]

     for dir_path in directories:

     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
+        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
+        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
+        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
+        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]

     copied_count = 0

 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
+    log_step("Creating minimal dataset...")

     combined_path = Path("/tmp/data/combined_dataset.csv")

     if combined_path.exists():
+        log_step("✅ Combined dataset already exists")
         return True

+    # Create minimal training data with more samples for better training
     minimal_data = pd.DataFrame({
         'text': [
+            # Real news samples
+            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
+            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
+            'Local authorities report significant improvements in air quality following new environmental regulations',
+            'Research published in Nature journal shows promising results for renewable energy storage technology',
+            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
+            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
+            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
+            'Transportation department announces infrastructure improvements to major highways across the region',
+            'Educational institutions implement new digital learning platforms to enhance student engagement',
+            'Agricultural studies reveal improved crop yields through sustainable farming practices',
+            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
+            'Municipal government approves budget for public transportation expansion project in urban areas',
+            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
+            'International trade agreements show positive impact on local businesses and job creation',
+            'Environmental protection agency releases report on water quality improvements in major rivers',

+            # Fake news samples
+            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
+            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
+            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
+            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
+            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
+            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
+            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
+            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
+            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
+            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
+            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
+            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
+            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
+            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
+            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
         ],
+        'label': [
+            # Real news labels (0)
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            # Fake news labels (1)
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+        ]
     })

     minimal_data.to_csv(combined_path, index=False)
+    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
+    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
+    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
     return True


+def run_initial_training():
+    """Run comprehensive model training for first-time setup"""
+    log_step("🚀 Starting comprehensive model training for first-time setup...")

     try:
+        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
         from sklearn.pipeline import Pipeline
+        from sklearn.feature_selection import SelectKBest, chi2
         from sklearn.preprocessing import FunctionTransformer
+        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
         import re

+        # Text preprocessing function (same as in train.py)
         def preprocess_text_function(texts):
             def clean_single_text(text):
                 text = str(text)
         df = pd.read_csv(dataset_path)
         log_step(f"📊 Loaded dataset with {len(df)} samples")

+        # Data validation and cleaning
+        df = df.dropna(subset=['text', 'label'])
+        df = df[df['text'].astype(str).str.len() > 10]
+
+        log_step(f"📊 After cleaning: {len(df)} samples")
+        log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")
+
         # Prepare data
         X = df['text'].values
         y = df['label'].values

         # Train-test split
         X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42, stratify=y
         )

         log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

+        # Create comprehensive pipeline
         text_preprocessor = FunctionTransformer(
             func=preprocess_text_function,
             validate=False
         )

+        vectorizer = TfidfVectorizer(
+            max_features=5000,
+            min_df=1,
+            max_df=0.95,
+            ngram_range=(1, 2),
+            stop_words='english',
+            sublinear_tf=True,
+            norm='l2'
+        )
+
+        feature_selector = SelectKBest(
+            score_func=chi2,
+            k=2000
+        )
+
+        # Create pipeline with Logistic Regression
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
+            ('vectorize', vectorizer),
+            ('feature_select', feature_selector),
+            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
         ])

+        log_step("🔧 Training model with optimized pipeline...")

+        # Hyperparameter tuning for datasets with sufficient samples
+        if len(X_train) >= 20:
+            log_step("⚙️ Performing hyperparameter tuning...")
+            param_grid = {
+                'model__C': [0.1, 1, 10],
+                'model__penalty': ['l2']
+            }
+
+            cv_folds = max(2, min(3, len(X_train) // 10))
+            grid_search = GridSearchCV(
+                pipeline,
+                param_grid,
+                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
+                scoring='f1_weighted',
+                n_jobs=1
+            )
+
+            grid_search.fit(X_train, y_train)
+            best_pipeline = grid_search.best_estimator_
+
+            log_step(f"✅ Best parameters: {grid_search.best_params_}")
+            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
+        else:
+            log_step("⚙️ Using simple training for small dataset...")
+            pipeline.fit(X_train, y_train)
+            best_pipeline = pipeline
+
+        # Evaluate model
+        y_pred = best_pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')

+        log_step(f"📈 Model Performance:")
+        log_step(f" - Accuracy: {accuracy:.4f}")
+        log_step(f" - F1 Score: {f1:.4f}")

+        # Save model artifacts
         log_step("💾 Saving model artifacts...")
+
+        # Save the complete pipeline
+        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
         log_step("✅ Saved complete pipeline")

+        # Save individual components for compatibility
+        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual model components")

         # Generate comprehensive metadata
         metadata = {
             "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            "model_type": "logistic_regression",
+            "training_method": "initial_setup",
+            "dataset_size": len(df),
             "train_size": len(X_train),
             "test_size": len(X_test),
+            "test_accuracy": float(accuracy),
+            "test_f1": float(f1),
+            "hyperparameter_tuning": len(X_train) >= 20,
+            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
+            "class_distribution": df['label'].value_counts().to_dict(),
+            "training_config": {
                 "max_features": 5000,
                 "ngram_range": [1, 2],
+                "feature_selection_k": 2000,
+                "test_size": 0.2
             },
+            "timestamp": datetime.now().isoformat(),
+            "initialization_notes": "Model trained during system initialization",
+            "ready_for_production": True
         }

+        # Save metadata
         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)

         log_step("✅ Saved comprehensive metadata")
+        log_step(f"🎉 Initial model training completed successfully!")
+        log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
+
         return True

     except Exception as e:
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
+            "event": "System initialized successfully with trained model",
             "level": "INFO"
         }]

             json.dump(activity_log, f, indent=2)

         # Create empty monitoring logs
+        log_dirs = ["/tmp/logs"]
+        for log_dir in log_dirs:
+            Path(log_dir).mkdir(parents=True, exist_ok=True)
+
         with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)

+        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
+            json.dump([], f)
+
         log_step("✅ Initial log files created")
         return True

         return False


+def validate_installation():
+    """Validate that the system is properly set up"""
+    log_step("🔍 Validating system installation...")
+
+    validation_checks = []
+
+    # Check model files
+    model_exists, model_files = check_model_exists()
+    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
+
+    # Check data files
+    data_exists, data_files = check_training_data_exists()
+    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
+
+    # Check directories
+    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
+    dirs_exist = all(Path(d).exists() for d in required_dirs)
+    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
+
+    # Check logs
+    log_exists = Path("/tmp/activity_log.json").exists()
+    validation_checks.append(("Log Files", log_exists, "Activity log created"))
+
+    # Test model loading
+    model_loadable = False
+    try:
+        import joblib
+        pipeline = joblib.load("/tmp/pipeline.pkl")
+        test_prediction = pipeline.predict(["This is a test news article"])
+        model_loadable = True
+        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
+    except Exception as e:
+        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
+
+    # Print validation results
+    log_step("📋 Validation Results:")
+    all_passed = True
+    for check_name, passed, details in validation_checks:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        log_step(f" {status} {check_name}: {details}")
+        if not passed:
+            all_passed = False
+
+    return all_passed, validation_checks
+
+
 def main():
+    """Main initialization function with smart training logic"""
+    log_step("🚀 Starting intelligent system initialization...")
+
+    # Check if model already exists
+    model_exists, existing_model_files = check_model_exists()

+    if model_exists:
+        log_step("🎯 EXISTING INSTALLATION DETECTED")
+        log_step("📄 Found existing model files - skipping training")
+
+        # Load existing metadata to show info
+        try:
+            with open("/tmp/metadata.json", 'r') as f:
+                metadata = json.load(f)
+
+            log_step(f"📊 Existing Model Info:")
+            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
+            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
+            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
+            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
+
+        except Exception as e:
+            log_step(f"⚠️ Could not read existing metadata: {e}")

     else:
+        log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
+        log_step("🔧 No existing model found - will train new model")
+
+    # Run initialization steps
+    steps = [
+        ("Directory Creation", create_directories),
+        ("Dataset Copy", copy_original_datasets),
+        ("Dataset Preparation", create_minimal_dataset),
+        ("Log Creation", create_initial_logs)
+    ]
+
+    # Add training step only if model doesn't exist
+    if not model_exists:
+        steps.insert(-1, ("🤖 Model Training", run_initial_training))

     failed_steps = []

+    for step_name, step_function in steps:
         try:
+            log_step(f"▶️ Starting: {step_name}")
             if step_function():
+                log_step(f"✅ {step_name} completed")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
+            log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)

+    # Final validation
+    log_step("🔍 Running final system validation...")
+    validation_passed, validation_results = validate_installation()
+
+    # Summary
+    log_step("=" * 60)
     if failed_steps:
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
+        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
+    else:
+        log_step("🎉 System initialization completed successfully!")
+
+    if validation_passed:
+        log_step("✅ All validation checks passed!")
+        log_step("🚀 System is ready for use!")

+        if not model_exists:
+            log_step("🤖 NEW MODEL TRAINED AND READY")
+            log_step("📊 You can now start making predictions!")
         else:
+            log_step("🔄 EXISTING MODEL VALIDATED AND READY")
+            log_step("📊 System restored from previous installation!")

     else:
+        log_step("❌ Some validation checks failed")
+        log_step("🔧 Manual intervention may be required")
+
+    log_step("=" * 60)


 if __name__ == "__main__":
     main()
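
The artifacts this commit writes during initialization (/tmp/pipeline.pkl, /tmp/model.pkl, /tmp/vectorizer.pkl, /tmp/metadata.json) can be consumed directly by downstream code, mirroring the smoke test inside validate_installation(). Below is a minimal sketch, assuming those paths exist and that joblib/scikit-learn are installed in the consuming environment; the helper name load_saved_pipeline and the __main__ block are illustrative and not part of this file:

import json
import joblib

def load_saved_pipeline(pipeline_path="/tmp/pipeline.pkl", metadata_path="/tmp/metadata.json"):
    """Load the pipeline and metadata saved by initialize_system.py (paths assumed from this commit)."""
    pipeline = joblib.load(pipeline_path)
    with open(metadata_path) as f:
        metadata = json.load(f)
    return pipeline, metadata

if __name__ == "__main__":
    pipeline, metadata = load_saved_pipeline()
    print("Model version:", metadata.get("model_version", "Unknown"))
    # predict() returns 0 for real news and 1 for fake news, following the label convention of the minimal dataset
    print("Prediction:", pipeline.predict(["This is a test news article"])[0])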