Ahmedik95316 committed on
Commit
c745fee
·
1 Parent(s): ae5a4d8

Update initialize_system.py


Restored previous working version

Files changed (1)
  1. initialize_system.py +93 -216
initialize_system.py CHANGED
@@ -7,176 +7,108 @@ from pathlib import Path
 from datetime import datetime
 
 
-# =============================================================================
-# CENTRALIZED PATH CONFIGURATION - MATCHES OTHER COMPONENTS
-# =============================================================================
-class PathConfig:
-    """Centralized path management to ensure consistency across all components"""
-
-    # Environment detection
-    if os.getenv("HF_SPACES_BUILD") == "1" or os.getenv("SPACE_ID"):
-        BASE_DIR = Path("/app/persistent")
-        ENVIRONMENT = "huggingface_spaces"
-    else:
-        BASE_DIR = Path("/tmp")
-        ENVIRONMENT = "local"
-
-    # Base directories
-    DATA_DIR = BASE_DIR / "data"
-    MODEL_DIR = BASE_DIR / "model"
-    LOGS_DIR = BASE_DIR / "logs"
-    RESULTS_DIR = BASE_DIR / "results"
-
-    # Model files - CONSISTENT PATHS
-    MODEL_FILE = MODEL_DIR / "model.pkl"
-    VECTORIZER_FILE = MODEL_DIR / "vectorizer.pkl"
-    PIPELINE_FILE = MODEL_DIR / "pipeline.pkl"
-    METADATA_FILE = BASE_DIR / "metadata.json"
-
-    # Data files
-    COMBINED_DATASET = DATA_DIR / "combined_dataset.csv"
-    KAGGLE_FAKE_DATA = DATA_DIR / "kaggle" / "Fake.csv"
-    KAGGLE_TRUE_DATA = DATA_DIR / "kaggle" / "True.csv"
-
-    # Log files
-    ACTIVITY_LOG = BASE_DIR / "activity_log.json"
-    MONITORING_LOG = LOGS_DIR / "monitoring_log.json"
-
-    @classmethod
-    def ensure_directories(cls):
-        """Create all required directories with proper permissions"""
-        directories = [cls.DATA_DIR, cls.MODEL_DIR, cls.LOGS_DIR, cls.RESULTS_DIR]
-
-        for directory in directories:
-            try:
-                directory.mkdir(parents=True, exist_ok=True, mode=0o755)
-                print(f"Directory ensured: {directory}")
-            except Exception as e:
-                print(f"Error creating {directory}: {e}")
-
-
 def log_step(message):
     """Log initialization steps"""
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
 def create_directories():
-    """Create necessary directories using centralized paths"""
+    """Create necessary directories"""
     log_step("Creating directory structure...")
 
-    try:
-        PathConfig.ensure_directories()
-
-        # Create kaggle subdirectory
-        kaggle_dir = PathConfig.DATA_DIR / "kaggle"
-        kaggle_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
-
-        log_step(f"Created {PathConfig.DATA_DIR}")
-        log_step(f"Created {PathConfig.MODEL_DIR}")
-        log_step(f"Created {PathConfig.LOGS_DIR}")
-        log_step(f"Created {kaggle_dir}")
-
-        return True
-
-    except Exception as e:
-        log_step(f"Directory Creation failed: {e}")
-        return False
+    directories = [
+        "/tmp/data",
+        "/tmp/model",
+        "/tmp/logs"
+    ]
+
+    for dir_path in directories:
+        Path(dir_path).mkdir(parents=True, exist_ok=True)
+        log_step(f"✅ Created {dir_path}")
 
 
 def copy_original_datasets():
-    """Copy original datasets using centralized paths"""
+    """Copy original datasets from /app to /tmp"""
     log_step("Copying original datasets...")
 
     source_files = [
-        ("/app/data/kaggle/Fake.csv", PathConfig.KAGGLE_FAKE_DATA),
-        ("/app/data/kaggle/True.csv", PathConfig.KAGGLE_TRUE_DATA),
-        ("/app/data/combined_dataset.csv", PathConfig.COMBINED_DATASET)
+        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
+        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
     ]
 
     copied_count = 0
     for source, dest in source_files:
-        try:
-            if Path(source).exists():
-                dest.parent.mkdir(parents=True, exist_ok=True)
-                shutil.copy(source, dest)
-                log_step(f"Copied {source} to {dest}")
-                copied_count += 1
-            else:
-                log_step(f"Source file not found: {source}")
-        except Exception as e:
-            log_step(f"Failed to copy {source}: {e}")
+        if Path(source).exists():
+            Path(dest).parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy(source, dest)
+            log_step(f"✅ Copied {source} to {dest}")
+            copied_count += 1
+        else:
+            log_step(f"⚠️ Source file not found: {source}")
 
-    if copied_count > 0:
-        return True
-    else:
-        log_step("No files copied, but not considered failure")
-        return True
+    return copied_count > 0
 
 
 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
     log_step("Creating minimal dataset...")
 
-    combined_path = PathConfig.COMBINED_DATASET
+    combined_path = Path("/tmp/data/combined_dataset.csv")
 
     if combined_path.exists():
-        log_step("Combined dataset already exists")
+        log_step("✅ Combined dataset already exists")
         return True
 
-    try:
-        # Ensure data directory exists
-        combined_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Create minimal training data
-        minimal_data = pd.DataFrame({
-            'text': [
-                'Scientists discover new species in Amazon rainforest',
-                'SHOCKING: Aliens spotted in Area 51, government confirms existence',
-                'Local authorities report increase in renewable energy adoption',
-                'You won\'t believe what happens when you eat this miracle fruit',
-                'Economic indicators show steady growth in manufacturing sector',
-                'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
-                'Research shows positive effects of meditation on mental health',
-                'Government hiding truth about flat earth, conspiracy theorists claim',
-                'New study reveals benefits of regular exercise for elderly',
-                'BREAKING: Time travel confirmed by underground scientists'
-            ],
-            'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0=Real, 1=Fake
-        })
-
-        minimal_data.to_csv(combined_path, index=False)
-        log_step(f"Created minimal dataset with {len(minimal_data)} samples")
-        return True
-
-    except Exception as e:
-        log_step(f"Failed to create minimal dataset: {e}")
-        return False
+    # Create minimal training data
+    minimal_data = pd.DataFrame({
+        'text': [
+            'Scientists discover new species in Amazon rainforest',
+            'SHOCKING: Aliens spotted in Area 51, government confirms existence',
+            'Local authorities report increase in renewable energy adoption',
+            'You won\'t believe what happens when you eat this miracle fruit',
+            'Economic indicators show steady growth in manufacturing sector',
+            'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
+            'Research shows positive effects of meditation on mental health',
+            'Government hiding truth about flat earth, conspiracy theorists claim',
+            'New study reveals benefits of regular exercise for elderly',
+            'BREAKING: Time travel confirmed by underground scientists'
+        ],
+        'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0=Real, 1=Fake
+    })
+
+    minimal_data.to_csv(combined_path, index=False)
+    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
+    return True
 
 
 def run_initial_training():
-    """Run basic model training using centralized paths"""
+    """Run basic model training"""
     log_step("Starting initial model training...")
 
     try:
-        # Check if model already exists - FIXED PATHS
-        if PathConfig.PIPELINE_FILE.exists():
-            log_step("Model files already exist")
+        # Check if model already exists
+        model_path = Path("/tmp/model.pkl")
+        vectorizer_path = Path("/tmp/vectorizer.pkl")
+
+        if model_path.exists() and vectorizer_path.exists():
+            log_step("✅ Model files already exist")
            return True
 
         # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
         from sklearn.model_selection import train_test_split
-        from sklearn.pipeline import Pipeline
         from sklearn.metrics import accuracy_score
         import joblib
 
-        # Load dataset - FIXED PATH
-        if not PathConfig.COMBINED_DATASET.exists():
-            log_step("No dataset available for training")
+        # Load dataset
+        dataset_path = Path("/tmp/data/combined_dataset.csv")
+        if not dataset_path.exists():
+            log_step("❌ No dataset available for training")
            return False
 
-        df = pd.read_csv(PathConfig.COMBINED_DATASET)
+        df = pd.read_csv(dataset_path)
         log_step(f"Loaded dataset with {len(df)} samples")
 
         # Prepare data
@@ -188,126 +120,78 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y
         )
 
-        # Create pipeline
-        pipeline = Pipeline([
-            ('vectorize', TfidfVectorizer(
-                max_features=5000,
-                stop_words='english',
-                ngram_range=(1, 2)
-            )),
-            ('model', LogisticRegression(max_iter=1000, random_state=42))
-        ])
+        # Vectorization
+        vectorizer = TfidfVectorizer(
+            max_features=5000,
+            stop_words='english',
+            ngram_range=(1, 2)
+        )
+        X_train_vec = vectorizer.fit_transform(X_train)
+        X_test_vec = vectorizer.transform(X_test)
 
-        # Train pipeline
-        pipeline.fit(X_train, y_train)
+        # Train model
+        model = LogisticRegression(max_iter=1000, random_state=42)
+        model.fit(X_train_vec, y_train)
 
         # Evaluate
-        y_pred = pipeline.predict(X_test)
+        y_pred = model.predict(X_test_vec)
         accuracy = accuracy_score(y_test, y_pred)
 
-        # Ensure model directory exists
-        PathConfig.MODEL_DIR.mkdir(parents=True, exist_ok=True)
+        # Save model
+        joblib.dump(model, "/tmp/model.pkl")
+        joblib.dump(vectorizer, "/tmp/vectorizer.pkl")
 
-        # Save complete pipeline - FIXED PATH
-        joblib.dump(pipeline, PathConfig.PIPELINE_FILE)
-
-        # Save individual components for backward compatibility - FIXED PATHS
-        joblib.dump(pipeline.named_steps['model'], PathConfig.MODEL_FILE)
-        joblib.dump(pipeline.named_steps['vectorize'], PathConfig.VECTORIZER_FILE)
-
-        # Save metadata - FIXED PATH
+        # Save metadata
         metadata = {
             "model_version": "v1.0_init",
             "test_accuracy": float(accuracy),
             "train_size": len(X_train),
             "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
-            "training_method": "initialization",
-            "model_type": "logistic_regression",
-            "environment": PathConfig.ENVIRONMENT,
-            "paths": {
-                "pipeline_file": str(PathConfig.PIPELINE_FILE),
-                "model_file": str(PathConfig.MODEL_FILE),
-                "vectorizer_file": str(PathConfig.VECTORIZER_FILE)
-            }
+            "training_method": "initialization"
         }
 
-        with open(PathConfig.METADATA_FILE, 'w') as f:
+        with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step(f"Training completed successfully, accuracy: {accuracy:.4f}")
-        log_step(f"Pipeline saved to: {PathConfig.PIPELINE_FILE}")
-        log_step(f"Metadata saved to: {PathConfig.METADATA_FILE}")
+        log_step(
+            f"✅ Training completed successfully, accuracy: {accuracy:.4f}")
         return True
 
     except Exception as e:
-        log_step(f"Training failed: {str(e)}")
-        import traceback
-        log_step(f"Full traceback: {traceback.format_exc()}")
+        log_step(f"❌ Training failed: {str(e)}")
         return False
 
 
 def create_initial_logs():
-    """Create initial log files using centralized paths"""
+    """Create initial log files"""
     log_step("Creating initial log files...")
 
     try:
-        # Ensure logs directory exists
-        PathConfig.LOGS_DIR.mkdir(parents=True, exist_ok=True)
-
-        # Activity log - FIXED PATH
+        # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully",
-            "environment": PathConfig.ENVIRONMENT,
-            "base_directory": str(PathConfig.BASE_DIR)
+            "event": "System initialized successfully"
         }]
 
-        with open(PathConfig.ACTIVITY_LOG, 'w') as f:
+        with open("/tmp/activity_log.json", 'w') as f:
             json.dump(activity_log, f, indent=2)
 
-        # Create empty monitoring logs - FIXED PATH
-        with open(PathConfig.MONITORING_LOG, 'w') as f:
+        # Create empty monitoring logs
+        with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)
 
-        log_step("Initial log files created")
-        log_step(f"Activity log: {PathConfig.ACTIVITY_LOG}")
-        log_step(f"Monitoring log: {PathConfig.MONITORING_LOG}")
+        log_step("✅ Initial log files created")
         return True
 
     except Exception as e:
-        log_step(f"Log creation failed: {str(e)}")
+        log_step(f"❌ Log creation failed: {str(e)}")
         return False
 
 
-def verify_initialization():
-    """Verify that initialization was successful"""
-    log_step("Verifying initialization...")
-
-    required_files = [
-        PathConfig.PIPELINE_FILE,
-        PathConfig.METADATA_FILE,
-        PathConfig.COMBINED_DATASET
-    ]
-
-    missing_files = []
-    for file_path in required_files:
-        if not file_path.exists():
-            missing_files.append(str(file_path))
-
-    if missing_files:
-        log_step(f"Missing required files: {missing_files}")
-        return False
-    else:
-        log_step("All required files present")
-        return True
-
-
 def main():
     """Main initialization function"""
-    log_step("Starting system initialization...")
-    log_step(f"Environment: {PathConfig.ENVIRONMENT}")
-    log_step(f"Base directory: {PathConfig.BASE_DIR}")
+    log_step("🚀 Starting system initialization...")
 
     steps = [
         ("Directory Creation", create_directories),
@@ -322,30 +206,23 @@ def main():
     for step_name, step_function in steps:
         try:
             if step_function():
-                log_step(f"{step_name} completed")
+                log_step(f"✅ {step_name} completed")
             else:
-                log_step(f"{step_name} failed")
+                log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
-            log_step(f"{step_name} failed: {str(e)}")
+            log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
-    # Final verification
-    if not failed_steps:
-        if verify_initialization():
-            log_step("System initialization completed successfully!")
-        else:
-            log_step("Initialization verification failed")
-            failed_steps.append("Verification")
-
     if failed_steps:
-        log_step(f"Initialization completed with {len(failed_steps)} failed steps")
+        log_step(
+            f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
         log_step(f"Failed: {', '.join(failed_steps)}")
     else:
-        log_step("System initialization completed successfully!")
+        log_step("🎉 System initialization completed successfully!")
 
     log_step("System ready for use!")
 
 
 if __name__ == "__main__":
-    main()
+    main()
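
Since this commit drops verify_initialization() along with the PathConfig class, a quick post-run sanity check can stand in for it. Below is a minimal sketch, not part of this commit: a hypothetical sanity_check.py that only looks for the /tmp artifacts the restored script writes (paths and the test_accuracy metadata key are taken from the diff above).

# sanity_check.py - hypothetical helper, not included in this commit.
# Run after `python initialize_system.py` to verify its /tmp artifacts.
import json
from pathlib import Path

expected = [
    Path("/tmp/model.pkl"),
    Path("/tmp/vectorizer.pkl"),
    Path("/tmp/metadata.json"),
    Path("/tmp/data/combined_dataset.csv"),
    Path("/tmp/activity_log.json"),
    Path("/tmp/logs/monitoring_log.json"),
]

missing = [str(p) for p in expected if not p.exists()]
if missing:
    print(f"Missing artifacts: {', '.join(missing)}")
else:
    # test_accuracy is written by run_initial_training() above
    meta = json.loads(Path("/tmp/metadata.json").read_text())
    print(f"Initialization OK, test accuracy: {meta['test_accuracy']:.4f}")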