Ahmedik95316 committed on
Commit
cc910a7
·
1 Parent(s): 57fc42f

Create initialize_system.py

Browse files
Files changed (1) hide show
  1. initialize_system.py +218 -0
initialize_system.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import shutil
4
+ import pandas as pd
5
+ import json
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+
9
+ def log_step(message):
10
+ """Log initialization steps"""
11
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
12
+
13
+ def create_directories():
14
+ """Create necessary directories"""
15
+ log_step("Creating directory structure...")
16
+
17
+ directories = [
18
+ "/tmp/data",
19
+ "/tmp/model",
20
+ "/tmp/logs"
21
+ ]
22
+
23
+ for dir_path in directories:
24
+ Path(dir_path).mkdir(parents=True, exist_ok=True)
25
+ log_step(f"βœ… Created {dir_path}")
26
+
27
+ def copy_original_datasets():
28
+ """Copy original datasets from /app to /tmp"""
29
+ log_step("Copying original datasets...")
30
+
31
+ source_files = [
32
+ ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
33
+ ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
34
+ ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
35
+ ]
36
+
37
+ copied_count = 0
38
+ for source, dest in source_files:
39
+ if Path(source).exists():
40
+ Path(dest).parent.mkdir(parents=True, exist_ok=True)
41
+ shutil.copy(source, dest)
42
+ log_step(f"βœ… Copied {source} to {dest}")
43
+ copied_count += 1
44
+ else:
45
+ log_step(f"⚠️ Source file not found: {source}")
46
+
47
+ return copied_count > 0
48
+
49
+ def create_minimal_dataset():
50
+ """Create a minimal dataset if original doesn't exist"""
51
+ log_step("Creating minimal dataset...")
52
+
53
+ combined_path = Path("/tmp/data/combined_dataset.csv")
54
+
55
+ if combined_path.exists():
56
+ log_step("βœ… Combined dataset already exists")
57
+ return True
58
+
59
+ # Create minimal training data
60
+ minimal_data = pd.DataFrame({
61
+ 'text': [
62
+ 'Scientists discover new species in Amazon rainforest',
63
+ 'SHOCKING: Aliens spotted in Area 51, government confirms existence',
64
+ 'Local authorities report increase in renewable energy adoption',
65
+ 'You won\'t believe what happens when you eat this miracle fruit',
66
+ 'Economic indicators show steady growth in manufacturing sector',
67
+ 'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
68
+ 'Research shows positive effects of meditation on mental health',
69
+ 'Government hiding truth about flat earth, conspiracy theorists claim',
70
+ 'New study reveals benefits of regular exercise for elderly',
71
+ 'BREAKING: Time travel confirmed by underground scientists'
72
+ ],
73
+ 'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] # 0=Real, 1=Fake
74
+ })
75
+
76
+ minimal_data.to_csv(combined_path, index=False)
77
+ log_step(f"βœ… Created minimal dataset with {len(minimal_data)} samples")
78
+ return True
79
+
80
+ def run_initial_training():
81
+ """Run basic model training"""
82
+ log_step("Starting initial model training...")
83
+
84
+ try:
85
+ # Check if model already exists
86
+ model_path = Path("/tmp/model.pkl")
87
+ vectorizer_path = Path("/tmp/vectorizer.pkl")
88
+
89
+ if model_path.exists() and vectorizer_path.exists():
90
+ log_step("βœ… Model files already exist")
91
+ return True
92
+
93
+ # Import required libraries
94
+ from sklearn.feature_extraction.text import TfidfVectorizer
95
+ from sklearn.linear_model import LogisticRegression
96
+ from sklearn.model_selection import train_test_split
97
+ from sklearn.metrics import accuracy_score
98
+ import joblib
99
+
100
+ # Load dataset
101
+ dataset_path = Path("/tmp/data/combined_dataset.csv")
102
+ if not dataset_path.exists():
103
+ log_step("❌ No dataset available for training")
104
+ return False
105
+
106
+ df = pd.read_csv(dataset_path)
107
+ log_step(f"Loaded dataset with {len(df)} samples")
108
+
109
+ # Prepare data
110
+ X = df['text'].values
111
+ y = df['label'].values
112
+
113
+ # Train-test split
114
+ X_train, X_test, y_train, y_test = train_test_split(
115
+ X, y, test_size=0.2, random_state=42, stratify=y
116
+ )
117
+
118
+ # Vectorization
119
+ vectorizer = TfidfVectorizer(
120
+ max_features=5000,
121
+ stop_words='english',
122
+ ngram_range=(1, 2)
123
+ )
124
+ X_train_vec = vectorizer.fit_transform(X_train)
125
+ X_test_vec = vectorizer.transform(X_test)
126
+
127
+ # Train model
128
+ model = LogisticRegression(max_iter=1000, random_state=42)
129
+ model.fit(X_train_vec, y_train)
130
+
131
+ # Evaluate
132
+ y_pred = model.predict(X_test_vec)
133
+ accuracy = accuracy_score(y_test, y_pred)
134
+
135
+ # Save model
136
+ joblib.dump(model, "/tmp/model.pkl")
137
+ joblib.dump(vectorizer, "/tmp/vectorizer.pkl")
138
+
139
+ # Save metadata
140
+ metadata = {
141
+ "model_version": "v1.0_init",
142
+ "test_accuracy": float(accuracy),
143
+ "train_size": len(X_train),
144
+ "test_size": len(X_test),
145
+ "timestamp": datetime.now().isoformat(),
146
+ "training_method": "initialization"
147
+ }
148
+
149
+ with open("/tmp/metadata.json", 'w') as f:
150
+ json.dump(metadata, f, indent=2)
151
+
152
+ log_step(f"βœ… Training completed successfully, accuracy: {accuracy:.4f}")
153
+ return True
154
+
155
+ except Exception as e:
156
+ log_step(f"❌ Training failed: {str(e)}")
157
+ return False
158
+
159
+ def create_initial_logs():
160
+ """Create initial log files"""
161
+ log_step("Creating initial log files...")
162
+
163
+ try:
164
+ # Activity log
165
+ activity_log = [{
166
+ "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
167
+ "event": "System initialized successfully"
168
+ }]
169
+
170
+ with open("/tmp/activity_log.json", 'w') as f:
171
+ json.dump(activity_log, f, indent=2)
172
+
173
+ # Create empty monitoring logs
174
+ with open("/tmp/logs/monitoring_log.json", 'w') as f:
175
+ json.dump([], f)
176
+
177
+ log_step("βœ… Initial log files created")
178
+ return True
179
+
180
+ except Exception as e:
181
+ log_step(f"❌ Log creation failed: {str(e)}")
182
+ return False
183
+
184
+ def main():
185
+ """Main initialization function"""
186
+ log_step("πŸš€ Starting system initialization...")
187
+
188
+ steps = [
189
+ ("Directory Creation", create_directories),
190
+ ("Dataset Copy", copy_original_datasets),
191
+ ("Minimal Dataset", create_minimal_dataset),
192
+ ("Model Training", run_initial_training),
193
+ ("Log Creation", create_initial_logs)
194
+ ]
195
+
196
+ failed_steps = []
197
+
198
+ for step_name, step_function in steps:
199
+ try:
200
+ if step_function():
201
+ log_step(f"βœ… {step_name} completed")
202
+ else:
203
+ log_step(f"❌ {step_name} failed")
204
+ failed_steps.append(step_name)
205
+ except Exception as e:
206
+ log_step(f"❌ {step_name} failed: {str(e)}")
207
+ failed_steps.append(step_name)
208
+
209
+ if failed_steps:
210
+ log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
211
+ log_step(f"Failed: {', '.join(failed_steps)}")
212
+ else:
213
+ log_step("πŸŽ‰ System initialization completed successfully!")
214
+
215
+ log_step("System ready for use!")
216
+
217
+ if __name__ == "__main__":
218
+ main()