Ahmedik95316 committed on
Commit 4a1bc0d · 1 Parent(s): c678ee1

Update initialize_system.py

Files changed (1):
  initialize_system.py  +227 -296
initialize_system.py CHANGED
@@ -12,42 +12,61 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


-def check_model_exists():
-    """Check if trained model already exists"""
-    model_files = [
-        Path("/tmp/pipeline.pkl"),
-        Path("/tmp/model.pkl"),
-        Path("/tmp/vectorizer.pkl"),
-        Path("/tmp/metadata.json")
+def check_existing_model():
+    """Check if a complete model setup already exists"""
+    log_step("Checking for existing model setup...")
+
+    critical_files = [
+        "/tmp/model.pkl",
+        "/tmp/vectorizer.pkl",
+        "/tmp/metadata.json"
     ]

-    existing_files = [f for f in model_files if f.exists()]
-
-    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
-        log_step(f"✅ Found {len(existing_files)} existing model files")
-        return True, existing_files
-    else:
-        log_step(f"❌ Missing model files - only found {len(existing_files)}")
-        return False, existing_files
-
-
-def check_training_data_exists():
-    """Check if training data is available"""
-    data_files = [
-        Path("/tmp/data/combined_dataset.csv"),
-        Path("/app/data/combined_dataset.csv"),
-        Path("/tmp/data/kaggle/Fake.csv"),
-        Path("/tmp/data/kaggle/True.csv")
-    ]
-
-    existing_data = [f for f in data_files if f.exists()]
-
-    if existing_data:
-        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
-        return True, existing_data
+    # Check if all critical files exist
+    existing_files = []
+    missing_files = []
+
+    for file_path in critical_files:
+        if Path(file_path).exists():
+            existing_files.append(file_path)
+        else:
+            missing_files.append(file_path)
+
+    # Also check for pipeline (new format)
+    pipeline_path = Path("/tmp/pipeline.pkl")
+    if pipeline_path.exists():
+        existing_files.append(str(pipeline_path))
+
+    if len(existing_files) >= 2:  # At least model + vectorizer OR pipeline + metadata
+        log_step(f"✅ Found existing model setup: {len(existing_files)} files")
+        for file_path in existing_files:
+            file_size = Path(file_path).stat().st_size if Path(file_path).exists() else 0
+            log_step(f" 📁 {file_path} ({file_size:,} bytes)")
+
+        # Check if metadata shows when it was last trained
+        try:
+            metadata_path = Path("/tmp/metadata.json")
+            if metadata_path.exists():
+                with open(metadata_path, 'r') as f:
+                    metadata = json.load(f)
+
+                last_trained = metadata.get('timestamp', 'Unknown')
+                model_version = metadata.get('model_version', 'Unknown')
+                accuracy = metadata.get('test_accuracy', 'Unknown')
+
+                log_step(f" 🎯 Model Version: {model_version}")
+                log_step(f" 📊 Accuracy: {accuracy}")
+                log_step(f" 🕒 Last Trained: {last_trained}")
+
+        except Exception as e:
+            log_step(f" ⚠️ Could not read metadata: {e}")
+
+        return True
     else:
-        log_step("❌ No training data found")
-        return False, []
+        log_step(f"❌ Incomplete model setup found")
+        log_step(f" Existing: {existing_files}")
+        log_step(f" Missing: {missing_files}")
+        return False


 def create_directories():
@@ -56,11 +75,8 @@ def create_directories():

     directories = [
         "/tmp/data",
-        "/tmp/data/kaggle",
         "/tmp/model",
-        "/tmp/logs",
-        "/tmp/results",
-        "/tmp/backups"
+        "/tmp/logs"
     ]

     for dir_path in directories:
@@ -74,11 +90,8 @@ def copy_original_datasets():

     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
-        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
-        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
-        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
-        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
+        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
     ]

     copied_count = 0
@@ -96,84 +109,70 @@ def copy_original_datasets():

 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
-    log_step("Creating minimal dataset...")
+    log_step("Checking for training dataset...")

     combined_path = Path("/tmp/data/combined_dataset.csv")

     if combined_path.exists():
-        log_step("✅ Combined dataset already exists")
+        # Check dataset size
+        df = pd.read_csv(combined_path)
+        log_step(f"✅ Found existing dataset with {len(df)} samples")
         return True

-    # Create minimal training data with more samples for better training
+    log_step("Creating minimal fallback dataset...")
+
+    # Create minimal training data with better examples
     minimal_data = pd.DataFrame({
         'text': [
-            # Real news samples
-            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
-            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
-            'Local authorities report significant improvements in air quality following new environmental regulations',
-            'Research published in Nature journal shows promising results for renewable energy storage technology',
-            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
-            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
-            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
-            'Transportation department announces infrastructure improvements to major highways across the region',
-            'Educational institutions implement new digital learning platforms to enhance student engagement',
-            'Agricultural studies reveal improved crop yields through sustainable farming practices',
-            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
-            'Municipal government approves budget for public transportation expansion project in urban areas',
-            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
-            'International trade agreements show positive impact on local businesses and job creation',
-            'Environmental protection agency releases report on water quality improvements in major rivers',
+            # Real news examples
+            'Scientists at MIT develop new renewable energy technology that could revolutionize solar power generation',
+            'Federal Reserve announces interest rate decision following economic data review by board members',
+            'Local hospital receives grant funding to expand emergency care services for rural communities',
+            'University researchers publish peer-reviewed study on climate change impact in Nature journal',
+            'City council approves new infrastructure project to improve public transportation accessibility',
+            'Technology company reports quarterly earnings beating analyst expectations amid market uncertainty',
+            'International health organization releases guidelines for pandemic preparedness protocols',
+            'Archaeological team discovers ancient artifacts providing insights into historical civilization',
+            'Education department announces new funding for STEM programs in underserved school districts',
+            'Environmental agency implements new regulations to protect endangered species habitats',

-            # Fake news samples
-            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
-            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
-            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
-            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
-            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
-            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
-            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
-            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
-            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
-            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
-            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
-            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
-            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
-            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
-            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
+            # Fake news examples
+            'SHOCKING: Government admits to hiding alien technology in secret underground military bases',
+            'BREAKING: Miracle cure discovered that doctors dont want you to know about eliminates all diseases',
+            'EXCLUSIVE: Celebrity reveals how eating this one weird fruit helped them lose 50 pounds overnight',
+            'URGENT: New world order conspiracy exposed through leaked documents from anonymous whistleblower',
+            'ALERT: Scientists confirm that 5G towers are controlling peoples minds through radio frequencies',
+            'REVEALED: Ancient pyramid discovered in Antarctica proves existence of lost advanced civilization',
+            'WARNING: Vaccination campaign is actually secret government plot to implant tracking microchips',
+            'EXPOSED: Time travel technology has been perfected by shadow government organization since 1960s',
+            'CONFIRMED: Flat earth society presents undeniable proof that NASA has been lying about space',
+            'INCREDIBLE: Man discovers how to predict lottery numbers using this simple mathematical formula'
         ],
-        'label': [
-            # Real news labels (0)
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            # Fake news labels (1)
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-        ]
+        'label': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # Real news (first 10)
+                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # Fake news (last 10)
     })

     minimal_data.to_csv(combined_path, index=False)
-    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
-    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
-    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
+    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
     return True


-def run_initial_training():
-    """Run comprehensive model training for first-time setup"""
-    log_step("🚀 Starting comprehensive model training for first-time setup...")
+def run_comprehensive_training():
+    """Run comprehensive model training with pipeline"""
+    log_step("🚀 Starting comprehensive model training...")

     try:
-        # Import training modules
+        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.ensemble import RandomForestClassifier
-        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
         from sklearn.pipeline import Pipeline
-        from sklearn.feature_selection import SelectKBest, chi2
+        from sklearn.model_selection import train_test_split
+        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
        from sklearn.preprocessing import FunctionTransformer
-        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
         import re

-        # Text preprocessing function (same as in train.py)
+        # Text preprocessing function
         def preprocess_text_function(texts):
             def clean_single_text(text):
                 text = str(text)
@@ -200,135 +199,112 @@ def run_initial_training():
         df = pd.read_csv(dataset_path)
         log_step(f"📊 Loaded dataset with {len(df)} samples")

-        # Data validation and cleaning
-        df = df.dropna(subset=['text', 'label'])
-        df = df[df['text'].astype(str).str.len() > 10]
-
-        log_step(f"📊 After cleaning: {len(df)} samples")
-        log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")
-
         # Prepare data
         X = df['text'].values
         y = df['label'].values

+        # Check class distribution
+        unique, counts = np.unique(y, return_counts=True)
+        log_step(f"📈 Class distribution: {dict(zip(unique, counts))}")
+
         # Train-test split
+        test_size = 0.2 if len(df) > 20 else 0.1
         X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y
+            X, y, test_size=test_size, random_state=42,
+            stratify=y if len(np.unique(y)) > 1 else None
         )

         log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

-        # Create comprehensive pipeline
+        # Create preprocessing pipeline
         text_preprocessor = FunctionTransformer(
             func=preprocess_text_function,
             validate=False
         )

-        vectorizer = TfidfVectorizer(
-            max_features=5000,
-            min_df=1,
-            max_df=0.95,
-            ngram_range=(1, 2),
-            stop_words='english',
-            sublinear_tf=True,
-            norm='l2'
-        )
-
-        feature_selector = SelectKBest(
-            score_func=chi2,
-            k=2000
-        )
-
-        # Create pipeline with Logistic Regression
+        # Create comprehensive pipeline
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
-            ('vectorize', vectorizer),
-            ('feature_select', feature_selector),
-            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
+            ('vectorize', TfidfVectorizer(
+                max_features=5000,
+                min_df=1,
+                max_df=0.95,
+                ngram_range=(1, 2),
+                stop_words='english',
+                sublinear_tf=True,
+                norm='l2'
+            )),
+            ('model', LogisticRegression(
+                max_iter=1000,
+                random_state=42,
+                class_weight='balanced'
+            ))
         ])

-        log_step("🔧 Training model with optimized pipeline...")
-
-        # Hyperparameter tuning for datasets with sufficient samples
-        if len(X_train) >= 20:
-            log_step("⚙️ Performing hyperparameter tuning...")
-            param_grid = {
-                'model__C': [0.1, 1, 10],
-                'model__penalty': ['l2']
-            }
-
-            cv_folds = max(2, min(3, len(X_train) // 10))
-            grid_search = GridSearchCV(
-                pipeline,
-                param_grid,
-                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
-                scoring='f1_weighted',
-                n_jobs=1
-            )
-
-            grid_search.fit(X_train, y_train)
-            best_pipeline = grid_search.best_estimator_
-
-            log_step(f"✅ Best parameters: {grid_search.best_params_}")
-            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
-        else:
-            log_step("⚙️ Using simple training for small dataset...")
-            pipeline.fit(X_train, y_train)
-            best_pipeline = pipeline
+        log_step("🔧 Training pipeline...")
+        pipeline.fit(X_train, y_train)

-        # Evaluate model
-        y_pred = best_pipeline.predict(X_test)
+        # Evaluate
+        y_pred = pipeline.predict(X_test)
+
+        # Calculate comprehensive metrics
         accuracy = accuracy_score(y_test, y_pred)
+        precision = precision_score(y_test, y_pred, average='weighted')
+        recall = recall_score(y_test, y_pred, average='weighted')
         f1 = f1_score(y_test, y_pred, average='weighted')

-        log_step(f"📈 Model Performance:")
-        log_step(f" - Accuracy: {accuracy:.4f}")
-        log_step(f" - F1 Score: {f1:.4f}")
+        log_step(f"📊 Model Performance:")
+        log_step(f" Accuracy: {accuracy:.4f}")
+        log_step(f" Precision: {precision:.4f}")
+        log_step(f" Recall: {recall:.4f}")
+        log_step(f" F1 Score: {f1:.4f}")

-        # Save model artifacts
+        # Save comprehensive model setup
         log_step("💾 Saving model artifacts...")
-
-        # Save the complete pipeline
-        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
+
+        # Save complete pipeline
+        joblib.dump(pipeline, "/tmp/pipeline.pkl")
         log_step("✅ Saved complete pipeline")

-        # Save individual components for compatibility
-        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-        log_step("✅ Saved individual model components")
+        # Save individual components for backward compatibility
+        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual components")

         # Generate comprehensive metadata
         metadata = {
             "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-            "model_type": "logistic_regression",
-            "training_method": "initial_setup",
-            "dataset_size": len(df),
-            "train_size": len(X_train),
-            "test_size": len(X_test),
+            "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
+            "test_precision": float(precision),
+            "test_recall": float(recall),
             "test_f1": float(f1),
-            "hyperparameter_tuning": len(X_train) >= 20,
-            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
-            "class_distribution": df['label'].value_counts().to_dict(),
-            "training_config": {
+            "train_size": len(X_train),
+            "test_size": len(X_test),
+            "dataset_size": len(df),
+            "timestamp": datetime.now().isoformat(),
+            "training_method": "comprehensive_initialization",
+            "pipeline_components": ["preprocess", "vectorize", "model"],
+            "vectorizer_config": {
                 "max_features": 5000,
                 "ngram_range": [1, 2],
-                "feature_selection_k": 2000,
-                "test_size": 0.2
+                "stop_words": "english"
             },
-            "timestamp": datetime.now().isoformat(),
-            "initialization_notes": "Model trained during system initialization",
-            "ready_for_production": True
+            "model_config": {
+                "algorithm": "LogisticRegression",
+                "max_iter": 1000,
+                "class_weight": "balanced"
+            }
         }

-        # Save metadata
         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)

         log_step("✅ Saved comprehensive metadata")
-        log_step(f"🎉 Initial model training completed successfully!")
-        log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
-
+        log_step(f"🎉 Training completed successfully!")
+        log_step(f" Final accuracy: {accuracy:.4f}")
+        log_step(f" Model ready for production use")
+
         return True

     except Exception as e:
@@ -346,7 +322,7 @@ def create_initial_logs():
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully with trained model",
+            "event": "System initialized successfully",
             "level": "INFO"
         }]

@@ -354,16 +330,9 @@ def create_initial_logs():
             json.dump(activity_log, f, indent=2)

         # Create empty monitoring logs
-        log_dirs = ["/tmp/logs"]
-        for log_dir in log_dirs:
-            Path(log_dir).mkdir(parents=True, exist_ok=True)
-
         with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)

-        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
-            json.dump([], f)
-
         log_step("✅ Initial log files created")
         return True

@@ -372,136 +341,98 @@ def create_initial_logs():
         return False


-def validate_installation():
-    """Validate that the system is properly set up"""
-    log_step("🔍 Validating system installation...")
-
-    validation_checks = []
-
-    # Check model files
-    model_exists, model_files = check_model_exists()
-    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
-
-    # Check data files
-    data_exists, data_files = check_training_data_exists()
-    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
-
-    # Check directories
-    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
-    dirs_exist = all(Path(d).exists() for d in required_dirs)
-    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
-
-    # Check logs
-    log_exists = Path("/tmp/activity_log.json").exists()
-    validation_checks.append(("Log Files", log_exists, "Activity log created"))
-
-    # Test model loading
-    model_loadable = False
-    try:
-        import joblib
-        pipeline = joblib.load("/tmp/pipeline.pkl")
-        test_prediction = pipeline.predict(["This is a test news article"])
-        model_loadable = True
-        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
-    except Exception as e:
-        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
-
-    # Print validation results
-    log_step("📋 Validation Results:")
-    all_passed = True
-    for check_name, passed, details in validation_checks:
-        status = "✅ PASS" if passed else "❌ FAIL"
-        log_step(f" {status} {check_name}: {details}")
-        if not passed:
-            all_passed = False
-
-    return all_passed, validation_checks
-
-
 def main():
-    """Main initialization function with smart training logic"""
-    log_step("🚀 Starting intelligent system initialization...")
-
-    # Check if model already exists
-    model_exists, existing_model_files = check_model_exists()
+    """Main initialization function with smart model training"""
+    log_step("🚀 Starting smart system initialization...")

-    if model_exists:
-        log_step("🎯 EXISTING INSTALLATION DETECTED")
-        log_step("📄 Found existing model files - skipping training")
-
-        # Load existing metadata to show info
-        try:
-            with open("/tmp/metadata.json", 'r') as f:
-                metadata = json.load(f)
-
-            log_step(f"📊 Existing Model Info:")
-            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
-            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
-            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
-            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
-
-        except Exception as e:
-            log_step(f"⚠️ Could not read existing metadata: {e}")
+    # First, check if we already have a working model
+    has_existing_model = check_existing_model()

+    # Define steps based on whether model exists
+    if has_existing_model:
+        log_step("🎯 Existing model detected - skipping training")
+        steps = [
+            ("Directory Creation", create_directories),
+            ("Dataset Copy", copy_original_datasets),
+            ("Dataset Validation", create_minimal_dataset),
+            ("Log Creation", create_initial_logs)
+        ]
     else:
-        log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
-        log_step("🔧 No existing model found - will train new model")
-
-    # Run initialization steps
-    steps = [
-        ("Directory Creation", create_directories),
-        ("Dataset Copy", copy_original_datasets),
-        ("Dataset Preparation", create_minimal_dataset),
-        ("Log Creation", create_initial_logs)
-    ]
-
-    # Add training step only if model doesn't exist
-    if not model_exists:
-        steps.insert(-1, ("🤖 Model Training", run_initial_training))
+        log_step("🆕 No existing model - will perform first-time setup with training")
+        steps = [
+            ("Directory Creation", create_directories),
+            ("Dataset Copy", copy_original_datasets),
+            ("Dataset Preparation", create_minimal_dataset),
+            ("Model Training", run_comprehensive_training),
+            ("Log Creation", create_initial_logs)
+        ]

     failed_steps = []
+    total_steps = len(steps)

-    for step_name, step_function in steps:
+    for i, (step_name, step_function) in enumerate(steps, 1):
+        log_step(f"📋 Step {i}/{total_steps}: {step_name}")
+
         try:
-            log_step(f"▶️ Starting: {step_name}")
             if step_function():
-                log_step(f"✅ {step_name} completed")
+                log_step(f"✅ {step_name} completed successfully")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
-            log_step(f"❌ {step_name} failed: {str(e)}")
+            log_step(f"❌ {step_name} failed with exception: {str(e)}")
             failed_steps.append(step_name)

-    # Final validation
-    log_step("🔍 Running final system validation...")
-    validation_passed, validation_results = validate_installation()
-
-    # Summary
-    log_step("=" * 60)
+    # Final summary
+    log_step("=" * 50)
     if failed_steps:
-        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
-        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
-    else:
-        log_step("🎉 System initialization completed successfully!")
-
-    if validation_passed:
-        log_step("✅ All validation checks passed!")
-        log_step("🚀 System is ready for use!")
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
+        log_step(f"Failed steps: {', '.join(failed_steps)}")

-        if not model_exists:
-            log_step("🤖 NEW MODEL TRAINED AND READY")
-            log_step("📊 You can now start making predictions!")
+        # Check if critical components are still available
+        if check_existing_model():
+            log_step("✅ Critical model components are available despite some failures")
         else:
-            log_step("🔄 EXISTING MODEL VALIDATED AND READY")
-            log_step("📊 System restored from previous installation!")
+            log_step("❌ Critical model components are missing - system may not work properly")

     else:
-        log_step("❌ Some validation checks failed")
-        log_step("🔧 Manual intervention may be required")
-
-    log_step("=" * 60)
+        if has_existing_model:
+            log_step("🎉 System initialization completed successfully!")
+            log_step("🚀 Existing model loaded - system ready for immediate use!")
+        else:
+            log_step("🎉 First-time setup completed successfully!")
+            log_step("🚀 Model trained and system ready for use!")
+
+    # Final status check
+    log_step("📊 Final System Status:")
+    critical_files = [
+        ("/tmp/pipeline.pkl", "Complete Pipeline"),
+        ("/tmp/model.pkl", "Model Component"),
+        ("/tmp/vectorizer.pkl", "Vectorizer Component"),
+        ("/tmp/metadata.json", "Model Metadata"),
+        ("/tmp/data/combined_dataset.csv", "Training Dataset")
+    ]
+
+    ready_count = 0
+    for file_path, description in critical_files:
+        if Path(file_path).exists():
+            file_size = Path(file_path).stat().st_size
+            log_step(f" ✅ {description}: {file_size:,} bytes")
+            ready_count += 1
+        else:
+            log_step(f" ❌ {description}: Missing")
+
+    log_step(f"📈 System Readiness: {ready_count}/{len(critical_files)} components available")
+
+    if ready_count >= 3:  # At least model + vectorizer + metadata OR pipeline + metadata
+        log_step("🎯 System is ready for production use!")
+    else:
+        log_step("⚠️ System setup incomplete - may require manual intervention")
+
+    log_step("=" * 50)


 if __name__ == "__main__":
+    # Add numpy import for the training function
+    import numpy as np
     main()
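
For reference, a minimal sketch of how the artifacts written by this script might be smoke-tested after initialization. The /tmp paths and metadata keys come from the code above; the sample headline is only illustrative, and the sketch assumes the saved pipeline can be unpickled in the target environment:

import json
import joblib

# Load the full pipeline saved by run_comprehensive_training()
pipeline = joblib.load("/tmp/pipeline.pkl")

# Read the metadata written alongside it
with open("/tmp/metadata.json") as f:
    metadata = json.load(f)
print(f"Model version: {metadata['model_version']}, test F1: {metadata['test_f1']:.4f}")

# Score one sample headline; the training data above uses 0 = real, 1 = fake
prediction = pipeline.predict(["University researchers publish peer-reviewed study on renewable energy"])
print(f"Predicted label: {prediction[0]}")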