Ahmedik95316 committed on
Commit 5bb1d1a · 1 Parent(s): 4a1bc0d

Update initialize_system.py

Files changed (1)
  1. initialize_system.py +296 -227
initialize_system.py CHANGED
@@ -12,61 +12,42 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


-def check_existing_model():
-    """Check if a complete model setup already exists"""
-    log_step("Checking for existing model setup...")
-
-    critical_files = [
-        "/tmp/model.pkl",
-        "/tmp/vectorizer.pkl",
-        "/tmp/metadata.json"
     ]

-    # Check if all critical files exist
-    existing_files = []
-    missing_files = []

-    for file_path in critical_files:
-        if Path(file_path).exists():
-            existing_files.append(file_path)
-        else:
-            missing_files.append(file_path)

-    # Also check for pipeline (new format)
-    pipeline_path = Path("/tmp/pipeline.pkl")
-    if pipeline_path.exists():
-        existing_files.append(str(pipeline_path))

-    if len(existing_files) >= 2: # At least model + vectorizer OR pipeline + metadata
-        log_step(f"✅ Found existing model setup: {len(existing_files)} files")
-        for file_path in existing_files:
-            file_size = Path(file_path).stat().st_size if Path(file_path).exists() else 0
-            log_step(f" 📁 {file_path} ({file_size:,} bytes)")
-
-        # Check if metadata shows when it was last trained
-        try:
-            metadata_path = Path("/tmp/metadata.json")
-            if metadata_path.exists():
-                with open(metadata_path, 'r') as f:
-                    metadata = json.load(f)
-
-                last_trained = metadata.get('timestamp', 'Unknown')
-                model_version = metadata.get('model_version', 'Unknown')
-                accuracy = metadata.get('test_accuracy', 'Unknown')
-
-                log_step(f" 🎯 Model Version: {model_version}")
-                log_step(f" 📊 Accuracy: {accuracy}")
-                log_step(f" 🕒 Last Trained: {last_trained}")
-
-        except Exception as e:
-            log_step(f" ⚠️ Could not read metadata: {e}")
-
-        return True
     else:
-        log_step(f"❌ Incomplete model setup found")
-        log_step(f" Existing: {existing_files}")
-        log_step(f" Missing: {missing_files}")
-        return False


 def create_directories():
@@ -75,8 +56,11 @@ def create_directories():

     directories = [
         "/tmp/data",
         "/tmp/model",
-        "/tmp/logs"
     ]

     for dir_path in directories:
@@ -90,8 +74,11 @@ def copy_original_datasets():

     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
-        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
     ]

     copied_count = 0
@@ -109,70 +96,84 @@ def copy_original_datasets():

 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
-    log_step("Checking for training dataset...")

     combined_path = Path("/tmp/data/combined_dataset.csv")

     if combined_path.exists():
-        # Check dataset size
-        df = pd.read_csv(combined_path)
-        log_step(f"✅ Found existing dataset with {len(df)} samples")
         return True

-    log_step("Creating minimal fallback dataset...")
-
-    # Create minimal training data with better examples
     minimal_data = pd.DataFrame({
         'text': [
-            # Real news examples
-            'Scientists at MIT develop new renewable energy technology that could revolutionize solar power generation',
-            'Federal Reserve announces interest rate decision following economic data review by board members',
-            'Local hospital receives grant funding to expand emergency care services for rural communities',
-            'University researchers publish peer-reviewed study on climate change impact in Nature journal',
-            'City council approves new infrastructure project to improve public transportation accessibility',
-            'Technology company reports quarterly earnings beating analyst expectations amid market uncertainty',
-            'International health organization releases guidelines for pandemic preparedness protocols',
-            'Archaeological team discovers ancient artifacts providing insights into historical civilization',
-            'Education department announces new funding for STEM programs in underserved school districts',
-            'Environmental agency implements new regulations to protect endangered species habitats',

-            # Fake news examples
-            'SHOCKING: Government admits to hiding alien technology in secret underground military bases',
-            'BREAKING: Miracle cure discovered that doctors dont want you to know about eliminates all diseases',
-            'EXCLUSIVE: Celebrity reveals how eating this one weird fruit helped them lose 50 pounds overnight',
-            'URGENT: New world order conspiracy exposed through leaked documents from anonymous whistleblower',
-            'ALERT: Scientists confirm that 5G towers are controlling peoples minds through radio frequencies',
-            'REVEALED: Ancient pyramid discovered in Antarctica proves existence of lost advanced civilization',
-            'WARNING: Vaccination campaign is actually secret government plot to implant tracking microchips',
-            'EXPOSED: Time travel technology has been perfected by shadow government organization since 1960s',
-            'CONFIRMED: Flat earth society presents undeniable proof that NASA has been lying about space',
-            'INCREDIBLE: Man discovers how to predict lottery numbers using this simple mathematical formula'
         ],
-        'label': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # Real news (first 10)
-                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # Fake news (last 10)
     })

     minimal_data.to_csv(combined_path, index=False)
-    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
     return True


-def run_comprehensive_training():
-    """Run comprehensive model training with pipeline"""
-    log_step("🚀 Starting comprehensive model training...")

     try:
-        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
         from sklearn.pipeline import Pipeline
-        from sklearn.model_selection import train_test_split
-        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
         from sklearn.preprocessing import FunctionTransformer
         import joblib
         import re

-        # Text preprocessing function
         def preprocess_text_function(texts):
             def clean_single_text(text):
                 text = str(text)
@@ -199,112 +200,135 @@ def run_comprehensive_training():
         df = pd.read_csv(dataset_path)
         log_step(f"📊 Loaded dataset with {len(df)} samples")

         # Prepare data
         X = df['text'].values
         y = df['label'].values

-        # Check class distribution
-        unique, counts = np.unique(y, return_counts=True)
-        log_step(f"📈 Class distribution: {dict(zip(unique, counts))}")
-
         # Train-test split
-        test_size = 0.2 if len(df) > 20 else 0.1
         X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=test_size, random_state=42,
-            stratify=y if len(np.unique(y)) > 1 else None
         )

         log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

-        # Create preprocessing pipeline
         text_preprocessor = FunctionTransformer(
             func=preprocess_text_function,
             validate=False
         )

-        # Create comprehensive pipeline
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
-            ('vectorize', TfidfVectorizer(
-                max_features=5000,
-                min_df=1,
-                max_df=0.95,
-                ngram_range=(1, 2),
-                stop_words='english',
-                sublinear_tf=True,
-                norm='l2'
-            )),
-            ('model', LogisticRegression(
-                max_iter=1000,
-                random_state=42,
-                class_weight='balanced'
-            ))
         ])

-        log_step("🔧 Training pipeline...")
-        pipeline.fit(X_train, y_train)

-        # Evaluate
-        y_pred = pipeline.predict(X_test)
-
-        # Calculate comprehensive metrics
         accuracy = accuracy_score(y_test, y_pred)
-        precision = precision_score(y_test, y_pred, average='weighted')
-        recall = recall_score(y_test, y_pred, average='weighted')
         f1 = f1_score(y_test, y_pred, average='weighted')

-        log_step(f"📊 Model Performance:")
-        log_step(f" Accuracy: {accuracy:.4f}")
-        log_step(f" Precision: {precision:.4f}")
-        log_step(f" Recall: {recall:.4f}")
-        log_step(f" F1 Score: {f1:.4f}")

-        # Save comprehensive model setup
         log_step("💾 Saving model artifacts...")
-
-        # Save complete pipeline
-        joblib.dump(pipeline, "/tmp/pipeline.pkl")
         log_step("✅ Saved complete pipeline")

-        # Save individual components for backward compatibility
-        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-        log_step("✅ Saved individual components")

         # Generate comprehensive metadata
         metadata = {
             "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-            "model_type": "logistic_regression_pipeline",
-            "test_accuracy": float(accuracy),
-            "test_precision": float(precision),
-            "test_recall": float(recall),
-            "test_f1": float(f1),
             "train_size": len(X_train),
             "test_size": len(X_test),
-            "dataset_size": len(df),
-            "timestamp": datetime.now().isoformat(),
-            "training_method": "comprehensive_initialization",
-            "pipeline_components": ["preprocess", "vectorize", "model"],
-            "vectorizer_config": {
                 "max_features": 5000,
                 "ngram_range": [1, 2],
-                "stop_words": "english"
             },
-            "model_config": {
-                "algorithm": "LogisticRegression",
-                "max_iter": 1000,
-                "class_weight": "balanced"
-            }
         }

         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)

         log_step("✅ Saved comprehensive metadata")
-        log_step(f"🎉 Training completed successfully!")
-        log_step(f" Final accuracy: {accuracy:.4f}")
-        log_step(f" Model ready for production use")
-
         return True

     except Exception as e:
@@ -322,7 +346,7 @@ def create_initial_logs():
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully",
             "level": "INFO"
         }]

@@ -330,9 +354,16 @@ def create_initial_logs():
             json.dump(activity_log, f, indent=2)

         # Create empty monitoring logs
         with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)

         log_step("✅ Initial log files created")
         return True

@@ -341,98 +372,136 @@ def create_initial_logs():
         return False


 def main():
-    """Main initialization function with smart model training"""
-    log_step("🚀 Starting smart system initialization...")

-    # First, check if we already have a working model
-    has_existing_model = check_existing_model()

-    # Define steps based on whether model exists
-    if has_existing_model:
-        log_step("🎯 Existing model detected - skipping training")
-        steps = [
-            ("Directory Creation", create_directories),
-            ("Dataset Copy", copy_original_datasets),
-            ("Dataset Validation", create_minimal_dataset),
-            ("Log Creation", create_initial_logs)
-        ]
     else:
-        log_step("🆕 No existing model - will perform first-time setup with training")
-        steps = [
-            ("Directory Creation", create_directories),
-            ("Dataset Copy", copy_original_datasets),
-            ("Dataset Preparation", create_minimal_dataset),
-            ("Model Training", run_comprehensive_training),
-            ("Log Creation", create_initial_logs)
-        ]

     failed_steps = []
-    total_steps = len(steps)

-    for i, (step_name, step_function) in enumerate(steps, 1):
-        log_step(f"📋 Step {i}/{total_steps}: {step_name}")
-
         try:
             if step_function():
-                log_step(f"✅ {step_name} completed successfully")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
-            log_step(f"❌ {step_name} failed with exception: {str(e)}")
             failed_steps.append(step_name)

-    # Final summary
-    log_step("=" * 50)
     if failed_steps:
-        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
-        log_step(f"Failed steps: {', '.join(failed_steps)}")

-        # Check if critical components are still available
-        if check_existing_model():
-            log_step("✅ Critical model components are available despite some failures")
         else:
-            log_step("❌ Critical model components are missing - system may not work properly")

     else:
-        if has_existing_model:
-            log_step("🎉 System initialization completed successfully!")
-            log_step("🚀 Existing model loaded - system ready for immediate use!")
-        else:
-            log_step("🎉 First-time setup completed successfully!")
-            log_step("🚀 Model trained and system ready for use!")
-
-    # Final status check
-    log_step("📊 Final System Status:")
-    critical_files = [
-        ("/tmp/pipeline.pkl", "Complete Pipeline"),
-        ("/tmp/model.pkl", "Model Component"),
-        ("/tmp/vectorizer.pkl", "Vectorizer Component"),
-        ("/tmp/metadata.json", "Model Metadata"),
-        ("/tmp/data/combined_dataset.csv", "Training Dataset")
-    ]
-
-    ready_count = 0
-    for file_path, description in critical_files:
-        if Path(file_path).exists():
-            file_size = Path(file_path).stat().st_size
-            log_step(f" ✅ {description}: {file_size:,} bytes")
-            ready_count += 1
-        else:
-            log_step(f" ❌ {description}: Missing")
-
-    log_step(f"📈 System Readiness: {ready_count}/{len(critical_files)} components available")
-
-    if ready_count >= 3: # At least model + vectorizer + metadata OR pipeline + metadata
-        log_step("🎯 System is ready for production use!")
-    else:
-        log_step("⚠️ System setup incomplete - may require manual intervention")
-
-    log_step("=" * 50)


 if __name__ == "__main__":
-    # Add numpy import for the training function
-    import numpy as np
     main()

     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


+def check_model_exists():
+    """Check if trained model already exists"""
+    model_files = [
+        Path("/tmp/pipeline.pkl"),
+        Path("/tmp/model.pkl"),
+        Path("/tmp/vectorizer.pkl"),
+        Path("/tmp/metadata.json")
     ]

+    existing_files = [f for f in model_files if f.exists()]

+    if len(existing_files) >= 2: # At least pipeline + metadata OR model + vectorizer
+        log_step(f"✅ Found {len(existing_files)} existing model files")
+        return True, existing_files
+    else:
+        log_step(f"❌ Missing model files - only found {len(existing_files)}")
+        return False, existing_files
+
+
+def check_training_data_exists():
+    """Check if training data is available"""
+    data_files = [
+        Path("/tmp/data/combined_dataset.csv"),
+        Path("/app/data/combined_dataset.csv"),
+        Path("/tmp/data/kaggle/Fake.csv"),
+        Path("/tmp/data/kaggle/True.csv")
+    ]

+    existing_data = [f for f in data_files if f.exists()]

+    if existing_data:
+        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
+        return True, existing_data
     else:
+        log_step("❌ No training data found")
+        return False, []


 def create_directories():

     directories = [
         "/tmp/data",
+        "/tmp/data/kaggle",
         "/tmp/model",
+        "/tmp/logs",
+        "/tmp/results",
+        "/tmp/backups"
     ]

     for dir_path in directories:

     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
+        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
+        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
+        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
+        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]

     copied_count = 0

 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
+    log_step("Creating minimal dataset...")

     combined_path = Path("/tmp/data/combined_dataset.csv")

     if combined_path.exists():
+        log_step("✅ Combined dataset already exists")
         return True

+    # Create minimal training data with more samples for better training
     minimal_data = pd.DataFrame({
         'text': [
+            # Real news samples
+            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
+            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
+            'Local authorities report significant improvements in air quality following new environmental regulations',
+            'Research published in Nature journal shows promising results for renewable energy storage technology',
+            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
+            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
+            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
+            'Transportation department announces infrastructure improvements to major highways across the region',
+            'Educational institutions implement new digital learning platforms to enhance student engagement',
+            'Agricultural studies reveal improved crop yields through sustainable farming practices',
+            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
+            'Municipal government approves budget for public transportation expansion project in urban areas',
+            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
+            'International trade agreements show positive impact on local businesses and job creation',
+            'Environmental protection agency releases report on water quality improvements in major rivers',

+            # Fake news samples
+            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
+            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
+            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
+            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
+            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
+            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
+            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
+            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
+            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
+            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
+            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
+            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
+            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
+            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
+            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
         ],
+        'label': [
+            # Real news labels (0)
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            # Fake news labels (1)
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+        ]
     })

     minimal_data.to_csv(combined_path, index=False)
+    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
+    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
+    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
     return True


+def run_initial_training():
+    """Run comprehensive model training for first-time setup"""
+    log_step("🚀 Starting comprehensive model training for first-time setup...")

     try:
+        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
         from sklearn.pipeline import Pipeline
+        from sklearn.feature_selection import SelectKBest, chi2
         from sklearn.preprocessing import FunctionTransformer
+        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
         import re

+        # Text preprocessing function (same as in train.py)
         def preprocess_text_function(texts):
             def clean_single_text(text):
                 text = str(text)
         df = pd.read_csv(dataset_path)
         log_step(f"📊 Loaded dataset with {len(df)} samples")

+        # Data validation and cleaning
+        df = df.dropna(subset=['text', 'label'])
+        df = df[df['text'].astype(str).str.len() > 10]
+
+        log_step(f"📊 After cleaning: {len(df)} samples")
+        log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")
+
         # Prepare data
         X = df['text'].values
         y = df['label'].values

         # Train-test split
         X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42, stratify=y
         )

         log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")

+        # Create comprehensive pipeline
         text_preprocessor = FunctionTransformer(
             func=preprocess_text_function,
             validate=False
         )

+        vectorizer = TfidfVectorizer(
+            max_features=5000,
+            min_df=1,
+            max_df=0.95,
+            ngram_range=(1, 2),
+            stop_words='english',
+            sublinear_tf=True,
+            norm='l2'
+        )
+
+        feature_selector = SelectKBest(
+            score_func=chi2,
+            k=2000
+        )
+
+        # Create pipeline with Logistic Regression
         pipeline = Pipeline([
             ('preprocess', text_preprocessor),
+            ('vectorize', vectorizer),
+            ('feature_select', feature_selector),
+            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
         ])

+        log_step("🔧 Training model with optimized pipeline...")

+        # Hyperparameter tuning for datasets with sufficient samples
+        if len(X_train) >= 20:
+            log_step("⚙️ Performing hyperparameter tuning...")
+            param_grid = {
+                'model__C': [0.1, 1, 10],
+                'model__penalty': ['l2']
+            }
+
+            cv_folds = max(2, min(3, len(X_train) // 10))
+            grid_search = GridSearchCV(
+                pipeline,
+                param_grid,
+                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
+                scoring='f1_weighted',
+                n_jobs=1
+            )
+
+            grid_search.fit(X_train, y_train)
+            best_pipeline = grid_search.best_estimator_
+
+            log_step(f"✅ Best parameters: {grid_search.best_params_}")
+            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
+        else:
+            log_step("⚙️ Using simple training for small dataset...")
+            pipeline.fit(X_train, y_train)
+            best_pipeline = pipeline
+
+        # Evaluate model
+        y_pred = best_pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')

+        log_step(f"📈 Model Performance:")
+        log_step(f" - Accuracy: {accuracy:.4f}")
+        log_step(f" - F1 Score: {f1:.4f}")

+        # Save model artifacts
         log_step("💾 Saving model artifacts...")
+
+        # Save the complete pipeline
+        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
         log_step("✅ Saved complete pipeline")

+        # Save individual components for compatibility
+        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual model components")

         # Generate comprehensive metadata
         metadata = {
             "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            "model_type": "logistic_regression",
+            "training_method": "initial_setup",
+            "dataset_size": len(df),
             "train_size": len(X_train),
             "test_size": len(X_test),
+            "test_accuracy": float(accuracy),
+            "test_f1": float(f1),
+            "hyperparameter_tuning": len(X_train) >= 20,
+            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
+            "class_distribution": df['label'].value_counts().to_dict(),
+            "training_config": {
                 "max_features": 5000,
                 "ngram_range": [1, 2],
+                "feature_selection_k": 2000,
+                "test_size": 0.2
             },
+            "timestamp": datetime.now().isoformat(),
+            "initialization_notes": "Model trained during system initialization",
+            "ready_for_production": True
         }

+        # Save metadata
         with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)

         log_step("✅ Saved comprehensive metadata")
+        log_step(f"🎉 Initial model training completed successfully!")
+        log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
+
         return True

     except Exception as e:
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
+            "event": "System initialized successfully with trained model",
             "level": "INFO"
         }]

             json.dump(activity_log, f, indent=2)

         # Create empty monitoring logs
+        log_dirs = ["/tmp/logs"]
+        for log_dir in log_dirs:
+            Path(log_dir).mkdir(parents=True, exist_ok=True)
+
         with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)

+        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
+            json.dump([], f)
+
         log_step("✅ Initial log files created")
         return True

         return False


+def validate_installation():
+    """Validate that the system is properly set up"""
+    log_step("🔍 Validating system installation...")
+
+    validation_checks = []
+
+    # Check model files
+    model_exists, model_files = check_model_exists()
+    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
+
+    # Check data files
+    data_exists, data_files = check_training_data_exists()
+    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
+
+    # Check directories
+    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
+    dirs_exist = all(Path(d).exists() for d in required_dirs)
+    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
+
+    # Check logs
+    log_exists = Path("/tmp/activity_log.json").exists()
+    validation_checks.append(("Log Files", log_exists, "Activity log created"))
+
+    # Test model loading
+    model_loadable = False
+    try:
+        import joblib
+        pipeline = joblib.load("/tmp/pipeline.pkl")
+        test_prediction = pipeline.predict(["This is a test news article"])
+        model_loadable = True
+        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
+    except Exception as e:
+        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
+
+    # Print validation results
+    log_step("📋 Validation Results:")
+    all_passed = True
+    for check_name, passed, details in validation_checks:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        log_step(f" {status} {check_name}: {details}")
+        if not passed:
+            all_passed = False
+
+    return all_passed, validation_checks
+
+
 def main():
+    """Main initialization function with smart training logic"""
+    log_step("🚀 Starting intelligent system initialization...")
+
+    # Check if model already exists
+    model_exists, existing_model_files = check_model_exists()

+    if model_exists:
+        log_step("🎯 EXISTING INSTALLATION DETECTED")
+        log_step("📄 Found existing model files - skipping training")
+
+        # Load existing metadata to show info
+        try:
+            with open("/tmp/metadata.json", 'r') as f:
+                metadata = json.load(f)
+
+            log_step(f"📊 Existing Model Info:")
+            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
+            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
+            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
+            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
+
+        except Exception as e:
+            log_step(f"⚠️ Could not read existing metadata: {e}")

     else:
+        log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
+        log_step("🔧 No existing model found - will train new model")
+
+    # Run initialization steps
+    steps = [
+        ("Directory Creation", create_directories),
+        ("Dataset Copy", copy_original_datasets),
+        ("Dataset Preparation", create_minimal_dataset),
+        ("Log Creation", create_initial_logs)
+    ]
+
+    # Add training step only if model doesn't exist
+    if not model_exists:
+        steps.insert(-1, ("🤖 Model Training", run_initial_training))

     failed_steps = []

+    for step_name, step_function in steps:
         try:
+            log_step(f"▶️ Starting: {step_name}")
             if step_function():
+                log_step(f"✅ {step_name} completed")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
+            log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)

+    # Final validation
+    log_step("🔍 Running final system validation...")
+    validation_passed, validation_results = validate_installation()
+
+    # Summary
+    log_step("=" * 60)
     if failed_steps:
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
+        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
+    else:
+        log_step("🎉 System initialization completed successfully!")
+
+    if validation_passed:
+        log_step("✅ All validation checks passed!")
+        log_step("🚀 System is ready for use!")

+        if not model_exists:
+            log_step("🤖 NEW MODEL TRAINED AND READY")
+            log_step("📊 You can now start making predictions!")
         else:
+            log_step("🔄 EXISTING MODEL VALIDATED AND READY")
+            log_step("📊 System restored from previous installation!")

     else:
+        log_step("❌ Some validation checks failed")
+        log_step("🔧 Manual intervention may be required")
+
+    log_step("=" * 60)


 if __name__ == "__main__":
     main()
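
The artifacts this commit writes during initialization (/tmp/pipeline.pkl, /tmp/model.pkl, /tmp/vectorizer.pkl, /tmp/metadata.json) can be consumed directly by downstream code, mirroring the smoke test inside validate_installation(). Below is a minimal sketch, assuming those paths exist and that joblib/scikit-learn are installed in the consuming environment; the helper name load_saved_pipeline and the __main__ block are illustrative and not part of this file:

import json
import joblib

def load_saved_pipeline(pipeline_path="/tmp/pipeline.pkl", metadata_path="/tmp/metadata.json"):
    """Load the pipeline and metadata saved by initialize_system.py (paths assumed from this commit)."""
    pipeline = joblib.load(pipeline_path)
    with open(metadata_path) as f:
        metadata = json.load(f)
    return pipeline, metadata

if __name__ == "__main__":
    pipeline, metadata = load_saved_pipeline()
    print("Model version:", metadata.get("model_version", "Unknown"))
    # predict() returns 0 for real news and 1 for fake news, following the label convention of the minimal dataset
    print("Prediction:", pipeline.predict(["This is a test news article"])[0])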