Ahmedik95316 committed on
Commit
c745fee
·
1 Parent(s): ae5a4d8

Update initialize_system.py


Restored previous working version

Files changed (1)
  1. initialize_system.py +93 -216
initialize_system.py CHANGED
@@ -7,176 +7,108 @@ from pathlib import Path
 from datetime import datetime
 
 
-# =============================================================================
-# CENTRALIZED PATH CONFIGURATION - MATCHES OTHER COMPONENTS
-# =============================================================================
-class PathConfig:
-    """Centralized path management to ensure consistency across all components"""
-
-    # Environment detection
-    if os.getenv("HF_SPACES_BUILD") == "1" or os.getenv("SPACE_ID"):
-        BASE_DIR = Path("/app/persistent")
-        ENVIRONMENT = "huggingface_spaces"
-    else:
-        BASE_DIR = Path("/tmp")
-        ENVIRONMENT = "local"
-
-    # Base directories
-    DATA_DIR = BASE_DIR / "data"
-    MODEL_DIR = BASE_DIR / "model"
-    LOGS_DIR = BASE_DIR / "logs"
-    RESULTS_DIR = BASE_DIR / "results"
-
-    # Model files - CONSISTENT PATHS
-    MODEL_FILE = MODEL_DIR / "model.pkl"
-    VECTORIZER_FILE = MODEL_DIR / "vectorizer.pkl"
-    PIPELINE_FILE = MODEL_DIR / "pipeline.pkl"
-    METADATA_FILE = BASE_DIR / "metadata.json"
-
-    # Data files
-    COMBINED_DATASET = DATA_DIR / "combined_dataset.csv"
-    KAGGLE_FAKE_DATA = DATA_DIR / "kaggle" / "Fake.csv"
-    KAGGLE_TRUE_DATA = DATA_DIR / "kaggle" / "True.csv"
-
-    # Log files
-    ACTIVITY_LOG = BASE_DIR / "activity_log.json"
-    MONITORING_LOG = LOGS_DIR / "monitoring_log.json"
-
-    @classmethod
-    def ensure_directories(cls):
-        """Create all required directories with proper permissions"""
-        directories = [cls.DATA_DIR, cls.MODEL_DIR, cls.LOGS_DIR, cls.RESULTS_DIR]
-
-        for directory in directories:
-            try:
-                directory.mkdir(parents=True, exist_ok=True, mode=0o755)
-                print(f"Directory ensured: {directory}")
-            except Exception as e:
-                print(f"Error creating {directory}: {e}")
-
-
 def log_step(message):
     """Log initialization steps"""
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
 def create_directories():
-    """Create necessary directories using centralized paths"""
+    """Create necessary directories"""
     log_step("Creating directory structure...")
 
-    try:
-        PathConfig.ensure_directories()
-
-        # Create kaggle subdirectory
-        kaggle_dir = PathConfig.DATA_DIR / "kaggle"
-        kaggle_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
-
-        log_step(f"Created {PathConfig.DATA_DIR}")
-        log_step(f"Created {PathConfig.MODEL_DIR}")
-        log_step(f"Created {PathConfig.LOGS_DIR}")
-        log_step(f"Created {kaggle_dir}")
-
-        return True
-
-    except Exception as e:
-        log_step(f"Directory Creation failed: {e}")
-        return False
+    directories = [
+        "/tmp/data",
+        "/tmp/model",
+        "/tmp/logs"
+    ]
+
+    for dir_path in directories:
+        Path(dir_path).mkdir(parents=True, exist_ok=True)
+        log_step(f"✅ Created {dir_path}")
 
 
 def copy_original_datasets():
-    """Copy original datasets using centralized paths"""
+    """Copy original datasets from /app to /tmp"""
     log_step("Copying original datasets...")
 
     source_files = [
-        ("/app/data/kaggle/Fake.csv", PathConfig.KAGGLE_FAKE_DATA),
-        ("/app/data/kaggle/True.csv", PathConfig.KAGGLE_TRUE_DATA),
-        ("/app/data/combined_dataset.csv", PathConfig.COMBINED_DATASET)
+        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
+        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
     ]
 
     copied_count = 0
     for source, dest in source_files:
-        try:
-            if Path(source).exists():
-                dest.parent.mkdir(parents=True, exist_ok=True)
-                shutil.copy(source, dest)
-                log_step(f"Copied {source} to {dest}")
-                copied_count += 1
-            else:
-                log_step(f"Source file not found: {source}")
-        except Exception as e:
-            log_step(f"Failed to copy {source}: {e}")
+        if Path(source).exists():
+            Path(dest).parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy(source, dest)
+            log_step(f"✅ Copied {source} to {dest}")
+            copied_count += 1
+        else:
+            log_step(f"⚠️ Source file not found: {source}")
 
-    if copied_count > 0:
-        return True
-    else:
-        log_step("No files copied, but not considered failure")
-        return True
+    return copied_count > 0
 
 
 def create_minimal_dataset():
     """Create a minimal dataset if original doesn't exist"""
     log_step("Creating minimal dataset...")
 
-    combined_path = PathConfig.COMBINED_DATASET
+    combined_path = Path("/tmp/data/combined_dataset.csv")
 
     if combined_path.exists():
-        log_step("Combined dataset already exists")
+        log_step("✅ Combined dataset already exists")
         return True
 
-    try:
-        # Ensure data directory exists
-        combined_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Create minimal training data
-        minimal_data = pd.DataFrame({
-            'text': [
-                'Scientists discover new species in Amazon rainforest',
-                'SHOCKING: Aliens spotted in Area 51, government confirms existence',
-                'Local authorities report increase in renewable energy adoption',
-                'You won\'t believe what happens when you eat this miracle fruit',
-                'Economic indicators show steady growth in manufacturing sector',
-                'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
-                'Research shows positive effects of meditation on mental health',
-                'Government hiding truth about flat earth, conspiracy theorists claim',
-                'New study reveals benefits of regular exercise for elderly',
-                'BREAKING: Time travel confirmed by underground scientists'
-            ],
-            'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0=Real, 1=Fake
-        })
-
-        minimal_data.to_csv(combined_path, index=False)
-        log_step(f"Created minimal dataset with {len(minimal_data)} samples")
-        return True
-
-    except Exception as e:
-        log_step(f"Failed to create minimal dataset: {e}")
-        return False
+    # Create minimal training data
+    minimal_data = pd.DataFrame({
+        'text': [
+            'Scientists discover new species in Amazon rainforest',
+            'SHOCKING: Aliens spotted in Area 51, government confirms existence',
+            'Local authorities report increase in renewable energy adoption',
+            'You won\'t believe what happens when you eat this miracle fruit',
+            'Economic indicators show steady growth in manufacturing sector',
+            'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
+            'Research shows positive effects of meditation on mental health',
+            'Government hiding truth about flat earth, conspiracy theorists claim',
+            'New study reveals benefits of regular exercise for elderly',
+            'BREAKING: Time travel confirmed by underground scientists'
+        ],
+        'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0=Real, 1=Fake
+    })
+
+    minimal_data.to_csv(combined_path, index=False)
+    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
+    return True
 
 
 def run_initial_training():
-    """Run basic model training using centralized paths"""
+    """Run basic model training"""
     log_step("Starting initial model training...")
 
     try:
-        # Check if model already exists - FIXED PATHS
-        if PathConfig.PIPELINE_FILE.exists():
-            log_step("Model files already exist")
+        # Check if model already exists
+        model_path = Path("/tmp/model.pkl")
+        vectorizer_path = Path("/tmp/vectorizer.pkl")
+
+        if model_path.exists() and vectorizer_path.exists():
+            log_step("✅ Model files already exist")
            return True
 
         # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
         from sklearn.model_selection import train_test_split
-        from sklearn.pipeline import Pipeline
         from sklearn.metrics import accuracy_score
         import joblib
 
-        # Load dataset - FIXED PATH
-        if not PathConfig.COMBINED_DATASET.exists():
-            log_step("No dataset available for training")
+        # Load dataset
+        dataset_path = Path("/tmp/data/combined_dataset.csv")
+        if not dataset_path.exists():
+            log_step("❌ No dataset available for training")
            return False
 
-        df = pd.read_csv(PathConfig.COMBINED_DATASET)
+        df = pd.read_csv(dataset_path)
         log_step(f"Loaded dataset with {len(df)} samples")
 
         # Prepare data
@@ -188,126 +120,78 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y
         )
 
-        # Create pipeline
-        pipeline = Pipeline([
-            ('vectorize', TfidfVectorizer(
-                max_features=5000,
-                stop_words='english',
-                ngram_range=(1, 2)
-            )),
-            ('model', LogisticRegression(max_iter=1000, random_state=42))
-        ])
+        # Vectorization
+        vectorizer = TfidfVectorizer(
+            max_features=5000,
+            stop_words='english',
+            ngram_range=(1, 2)
+        )
+        X_train_vec = vectorizer.fit_transform(X_train)
+        X_test_vec = vectorizer.transform(X_test)
 
-        # Train pipeline
-        pipeline.fit(X_train, y_train)
+        # Train model
+        model = LogisticRegression(max_iter=1000, random_state=42)
+        model.fit(X_train_vec, y_train)
 
         # Evaluate
-        y_pred = pipeline.predict(X_test)
+        y_pred = model.predict(X_test_vec)
         accuracy = accuracy_score(y_test, y_pred)
 
-        # Ensure model directory exists
-        PathConfig.MODEL_DIR.mkdir(parents=True, exist_ok=True)
+        # Save model
+        joblib.dump(model, "/tmp/model.pkl")
+        joblib.dump(vectorizer, "/tmp/vectorizer.pkl")
 
-        # Save complete pipeline - FIXED PATH
-        joblib.dump(pipeline, PathConfig.PIPELINE_FILE)
-
-        # Save individual components for backward compatibility - FIXED PATHS
-        joblib.dump(pipeline.named_steps['model'], PathConfig.MODEL_FILE)
-        joblib.dump(pipeline.named_steps['vectorize'], PathConfig.VECTORIZER_FILE)
-
-        # Save metadata - FIXED PATH
+        # Save metadata
         metadata = {
             "model_version": "v1.0_init",
             "test_accuracy": float(accuracy),
             "train_size": len(X_train),
             "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
-            "training_method": "initialization",
-            "model_type": "logistic_regression",
-            "environment": PathConfig.ENVIRONMENT,
-            "paths": {
-                "pipeline_file": str(PathConfig.PIPELINE_FILE),
-                "model_file": str(PathConfig.MODEL_FILE),
-                "vectorizer_file": str(PathConfig.VECTORIZER_FILE)
-            }
+            "training_method": "initialization"
         }
 
-        with open(PathConfig.METADATA_FILE, 'w') as f:
+        with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step(f"Training completed successfully, accuracy: {accuracy:.4f}")
-        log_step(f"Pipeline saved to: {PathConfig.PIPELINE_FILE}")
-        log_step(f"Metadata saved to: {PathConfig.METADATA_FILE}")
+        log_step(
+            f"✅ Training completed successfully, accuracy: {accuracy:.4f}")
         return True
 
     except Exception as e:
-        log_step(f"Training failed: {str(e)}")
-        import traceback
-        log_step(f"Full traceback: {traceback.format_exc()}")
+        log_step(f"❌ Training failed: {str(e)}")
         return False
 
 
 def create_initial_logs():
-    """Create initial log files using centralized paths"""
+    """Create initial log files"""
     log_step("Creating initial log files...")
 
     try:
-        # Ensure logs directory exists
-        PathConfig.LOGS_DIR.mkdir(parents=True, exist_ok=True)
-
-        # Activity log - FIXED PATH
+        # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully",
-            "environment": PathConfig.ENVIRONMENT,
-            "base_directory": str(PathConfig.BASE_DIR)
+            "event": "System initialized successfully"
         }]
 
-        with open(PathConfig.ACTIVITY_LOG, 'w') as f:
+        with open("/tmp/activity_log.json", 'w') as f:
             json.dump(activity_log, f, indent=2)
 
-        # Create empty monitoring logs - FIXED PATH
-        with open(PathConfig.MONITORING_LOG, 'w') as f:
+        # Create empty monitoring logs
+        with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)
 
-        log_step("Initial log files created")
-        log_step(f"Activity log: {PathConfig.ACTIVITY_LOG}")
-        log_step(f"Monitoring log: {PathConfig.MONITORING_LOG}")
+        log_step("✅ Initial log files created")
         return True
 
     except Exception as e:
-        log_step(f"Log creation failed: {str(e)}")
+        log_step(f"❌ Log creation failed: {str(e)}")
         return False
 
 
-def verify_initialization():
-    """Verify that initialization was successful"""
-    log_step("Verifying initialization...")
-
-    required_files = [
-        PathConfig.PIPELINE_FILE,
-        PathConfig.METADATA_FILE,
-        PathConfig.COMBINED_DATASET
-    ]
-
-    missing_files = []
-    for file_path in required_files:
-        if not file_path.exists():
-            missing_files.append(str(file_path))
-
-    if missing_files:
-        log_step(f"Missing required files: {missing_files}")
-        return False
-    else:
-        log_step("All required files present")
-        return True
-
-
 def main():
     """Main initialization function"""
-    log_step("Starting system initialization...")
-    log_step(f"Environment: {PathConfig.ENVIRONMENT}")
-    log_step(f"Base directory: {PathConfig.BASE_DIR}")
+    log_step("🚀 Starting system initialization...")
 
     steps = [
         ("Directory Creation", create_directories),
@@ -322,30 +206,23 @@ def main():
     for step_name, step_function in steps:
         try:
             if step_function():
-                log_step(f"{step_name} completed")
+                log_step(f"✅ {step_name} completed")
             else:
-                log_step(f"{step_name} failed")
+                log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
         except Exception as e:
-            log_step(f"{step_name} failed: {str(e)}")
+            log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
-    # Final verification
-    if not failed_steps:
-        if verify_initialization():
-            log_step("System initialization completed successfully!")
-        else:
-            log_step("Initialization verification failed")
-            failed_steps.append("Verification")
-
     if failed_steps:
-        log_step(f"Initialization completed with {len(failed_steps)} failed steps")
+        log_step(
+            f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
         log_step(f"Failed: {', '.join(failed_steps)}")
     else:
-        log_step("System initialization completed successfully!")
+        log_step("🎉 System initialization completed successfully!")
 
     log_step("System ready for use!")
 
 
 if __name__ == "__main__":
-    main()
+    main()
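
Since this commit drops verify_initialization() along with the PathConfig class, a quick post-run sanity check can stand in for it. Below is a minimal sketch, not part of this commit: a hypothetical sanity_check.py that only looks for the /tmp artifacts the restored script writes (paths and the test_accuracy metadata key are taken from the diff above).

# sanity_check.py - hypothetical helper, not included in this commit.
# Run after `python initialize_system.py` to verify its /tmp artifacts.
import json
from pathlib import Path

expected = [
    Path("/tmp/model.pkl"),
    Path("/tmp/vectorizer.pkl"),
    Path("/tmp/metadata.json"),
    Path("/tmp/data/combined_dataset.csv"),
    Path("/tmp/activity_log.json"),
    Path("/tmp/logs/monitoring_log.json"),
]

missing = [str(p) for p in expected if not p.exists()]
if missing:
    print(f"Missing artifacts: {', '.join(missing)}")
else:
    # test_accuracy is written by run_initial_training() above
    meta = json.loads(Path("/tmp/metadata.json").read_text())
    print(f"Initialization OK, test accuracy: {meta['test_accuracy']:.4f}")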