Spaces:

Ahmedik95316
/

Fake-News-Detection-with-MLOps

Running

App Files Files Community

Ahmedik95316 committed on Jul 5

Commit

cc910a7

1 Parent(s): 57fc42f

Create initialize_system.py

Browse files

Files changed (1) hide show

initialize_system.py +218 -0

initialize_system.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import os
+import sys
+import shutil
+import pandas as pd
+import json
+from pathlib import Path
+from datetime import datetime
def log_step(message):
    """Print *message* to stdout prefixed with the current local time.

    The prefix has the form ``[HH:MM:SS]`` and is taken from the naive local
    clock at the moment of the call.
    """
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {message}")
def create_directories():
    """Create the working directories used by the rest of initialization.

    Ensures ``/tmp/data``, ``/tmp/model`` and ``/tmp/logs`` exist, creating
    missing parents and leaving already-existing directories untouched.

    Returns:
        bool: ``True`` once every directory exists. ``main`` treats a falsy
        return value as a failed step, so the explicit ``True`` is what lets
        this step be reported as completed.

    Raises:
        OSError: If a directory cannot be created (for example permission
            denied, or the path already exists as a regular file).
    """
    log_step("Creating directory structure...")
    directories = [
        "/tmp/data",
        "/tmp/model",
        "/tmp/logs",
    ]
    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"✅ Created {dir_path}")
    # Without an explicit result the function would return None, which the
    # step runner interprets as failure even though nothing went wrong.
    return True
def copy_original_datasets():
    """Copy the bundled datasets from ``/app/data`` into ``/tmp/data``.

    Each source file is handled independently: a missing source is logged as
    a warning, and a copy that fails with an OS-level error is logged and
    skipped, so one bad file never prevents the remaining files from being
    attempted.

    Returns:
        bool: ``True`` if at least one file was copied, ``False`` otherwise.
    """
    log_step("Copying original datasets...")
    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
    ]
    copied_count = 0
    for source, dest in source_files:
        if not Path(source).exists():
            log_step(f"⚠️ Source file not found: {source}")
            continue
        try:
            # The destination sub-directory (e.g. /tmp/data/kaggle) may not
            # exist yet, so create it right before copying.
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
        except OSError as exc:
            # shutil.Error / SameFileError are OSError subclasses as well.
            log_step(f"❌ Failed to copy {source} to {dest}: {exc}")
            continue
        log_step(f"✅ Copied {source} to {dest}")
        copied_count += 1
    return copied_count > 0
def create_minimal_dataset():
    """Write a ten-row fallback dataset unless a combined dataset exists.

    If ``/tmp/data/combined_dataset.csv`` is already present it is left
    untouched. Otherwise a CSV with the columns ``text`` and ``label``
    (0 = real, 1 = fake) is written there, alternating real and fake
    headlines.

    Returns:
        bool: Always ``True``; failures surface as exceptions.
    """
    log_step("Creating minimal dataset...")
    target = Path("/tmp/data/combined_dataset.csv")
    if target.exists():
        log_step("✅ Combined dataset already exists")
        return True

    real, fake = 0, 1
    # (headline, label) rows; the order matches the on-disk row order.
    rows = [
        ('Scientists discover new species in Amazon rainforest', real),
        ('SHOCKING: Aliens spotted in Area 51, government confirms existence', fake),
        ('Local authorities report increase in renewable energy adoption', real),
        ("You won't believe what happens when you eat this miracle fruit", fake),
        ('Economic indicators show steady growth in manufacturing sector', real),
        ('EXCLUSIVE: Celebrity caught in secret alien communication scandal', fake),
        ('Research shows positive effects of meditation on mental health', real),
        ('Government hiding truth about flat earth, conspiracy theorists claim', fake),
        ('New study reveals benefits of regular exercise for elderly', real),
        ('BREAKING: Time travel confirmed by underground scientists', fake),
    ]
    frame = pd.DataFrame(rows, columns=['text', 'label'])
    frame.to_csv(target, index=False)
    log_step(f"✅ Created minimal dataset with {len(frame)} samples")
    return True
def run_initial_training():
    """Train and persist a baseline TF-IDF + logistic-regression classifier.

    Training is skipped when both ``/tmp/model.pkl`` and
    ``/tmp/vectorizer.pkl`` already exist. Otherwise the function loads
    ``/tmp/data/combined_dataset.csv`` (columns ``text`` and ``label``),
    discards rows where either value is missing, fits the vectorizer and the
    model on a stratified 80/20 split, and writes the model, the vectorizer
    and a summary to ``/tmp/metadata.json``.

    Returns:
        bool: ``True`` if the model files already existed or training
        finished; ``False`` if the dataset is missing or unusable, or if any
        step raised (the error is logged, never propagated).
    """
    log_step("Starting initial model training...")
    model_path = Path("/tmp/model.pkl")
    vectorizer_path = Path("/tmp/vectorizer.pkl")
    metadata_path = Path("/tmp/metadata.json")
    dataset_path = Path("/tmp/data/combined_dataset.csv")
    try:
        # Check if model already exists
        if model_path.exists() and vectorizer_path.exists():
            log_step("✅ Model files already exist")
            return True

        # Imported lazily so that a missing ML dependency is reported as a
        # failed training step instead of breaking the whole script at import.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score
        import joblib

        # Load dataset
        if not dataset_path.exists():
            log_step("❌ No dataset available for training")
            return False
        df = pd.read_csv(dataset_path)
        log_step(f"Loaded dataset with {len(df)} samples")

        missing_columns = {'text', 'label'} - set(df.columns)
        if missing_columns:
            log_step(
                "❌ Dataset is missing required columns: "
                f"{', '.join(sorted(missing_columns))}"
            )
            return False

        # Rows without text or label would reach the vectorizer / splitter as
        # NaN and abort training, so drop them up front.
        total_rows = len(df)
        df = df.dropna(subset=['text', 'label'])
        dropped_rows = total_rows - len(df)
        if dropped_rows:
            log_step(f"⚠️ Dropped {dropped_rows} rows with missing text or label")
        if df.empty:
            log_step("❌ Dataset contains no usable rows")
            return False

        # Prepare data; coerce to str so stray numeric cells do not break
        # the tokenizer.
        X = df['text'].astype(str).values
        y = df['label'].values

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Vectorization: fit on the training split only, then reuse the
        # fitted vocabulary for the held-out split.
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2)
        )
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # Train model
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train_vec, y_train)

        # Evaluate
        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)

        # Save model
        joblib.dump(model, str(model_path))
        joblib.dump(vectorizer, str(vectorizer_path))

        # Save metadata
        metadata = {
            "model_version": "v1.0_init",
            "test_accuracy": float(accuracy),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "timestamp": datetime.now().isoformat(),
            "training_method": "initialization"
        }
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

        log_step(f"✅ Training completed successfully, accuracy: {accuracy:.4f}")
        return True
    except Exception as e:
        # Top-level boundary for this step: report and signal failure so the
        # remaining initialization steps can still run.
        log_step(f"❌ Training failed: {str(e)}")
        return False
def create_initial_logs():
    """Write the starting activity log and an empty monitoring log.

    ``/tmp/activity_log.json`` is (over)written with a single
    "System initialized successfully" event stamped with the current local
    time, and ``/tmp/logs/monitoring_log.json`` is (over)written with an
    empty JSON list.

    Returns:
        bool: ``True`` when both files were written, ``False`` if anything
        raised (the error is logged rather than propagated).
    """
    log_step("Creating initial log files...")
    try:
        stamp = datetime.now().strftime("%Y-%m-%d %I:%M %p")
        first_event = {
            "timestamp": stamp,
            "event": "System initialized successfully",
        }

        # Activity log: starts with the single initialization event.
        with open("/tmp/activity_log.json", 'w') as activity_file:
            activity_file.write(json.dumps([first_event], indent=2))

        # Monitoring log: starts out as an empty list.
        with open("/tmp/logs/monitoring_log.json", 'w') as monitoring_file:
            monitoring_file.write(json.dumps([]))

        log_step("✅ Initial log files created")
        return True
    except Exception as exc:
        log_step(f"❌ Log creation failed: {exc!s}")
        return False
def main():
    """Run every initialization step in order and log a summary.

    A step counts as failed when it returns a falsy value or raises; in both
    cases the remaining steps are still executed. Nothing is returned and the
    process exit status is not affected by failed steps.
    """
    log_step("🚀 Starting system initialization...")

    pipeline = (
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Minimal Dataset", create_minimal_dataset),
        ("Model Training", run_initial_training),
        ("Log Creation", create_initial_logs),
    )

    failures = []
    for label, action in pipeline:
        try:
            if not action():
                log_step(f"❌ {label} failed")
                failures.append(label)
                continue
            log_step(f"✅ {label} completed")
        except Exception as exc:
            # A crashing step must not stop the steps that follow it.
            log_step(f"❌ {label} failed: {exc!s}")
            failures.append(label)

    if not failures:
        log_step("🎉 System initialization completed successfully!")
    else:
        log_step(f"⚠️ Initialization completed with {len(failures)} failed steps")
        log_step(f"Failed: {', '.join(failures)}")
    log_step("System ready for use!")


# Run the initialization only when executed as a script, not on import.
if __name__ == "__main__":
    main()