| | import pandas as pd |
| | import numpy as np |
| | import joblib |
| | import os |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.naive_bayes import GaussianNB |
| | from sklearn.ensemble import RandomForestClassifier |
| | from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report |
| | from sklearn.preprocessing import LabelEncoder |
| | import xgboost as xgb |
| | from data_loader import load_data |
| | from preprocessing import preprocess_pipeline |
| |
|
def train_and_evaluate():
    """Train the candidate classifiers, report validation metrics, save the best.

    Steps, in order:

    1. Load the training frame from ``../data/crimedataset`` (relative to
       this file) via ``load_data``; the second value it returns is ignored.
    2. Run ``preprocess_pipeline`` in training mode, which also returns the
       k-means model that is persisted at the end.
    3. Label-encode the ``PdDistrict`` and ``Season`` columns.
    4. Hold out 20% of the rows for validation (``random_state=42``).
    5. Fit Naive Bayes, Random Forest and XGBoost, printing accuracy,
       precision and recall for each on the validation split.
    6. Write the most accurate model, the fitted label encoders and the
       k-means model to ``../models`` as ``best_model.pkl``,
       ``label_encoders.pkl`` and ``kmeans.pkl``.

    Precision and recall use scikit-learn's default (binary) averaging, so
    the ``IsViolent`` target is presumably binary -- confirm against
    ``preprocess_pipeline``.

    Returns:
        dict: maps each model's display name to a dict with the keys
        ``'Accuracy'``, ``'Precision'`` and ``'Recall'``.
    """
    base_dir = os.path.dirname(__file__)

    data_dir = os.path.join(base_dir, '../data/crimedataset')
    train_df, _ = load_data(data_dir)

    print("Preprocessing data...")
    df, kmeans_model = preprocess_pipeline(train_df, is_train=True, kmeans_model=None)

    features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
    target = 'IsViolent'

    print("Encoding categorical features...")
    # One encoder per column, fitted on the full frame before the split.
    # They are saved alongside the model at the end of this function.
    le_dict = {}
    for col in ['PdDistrict', 'Season']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        le_dict[col] = le

    X = df[features]
    y = df[target]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    }

    best_model = None
    # Start below any achievable accuracy so a model is always selected, even
    # if every candidate scores exactly 0. Seeding with 0 would leave
    # best_model as None and pickle None as "best_model.pkl".
    best_score = float("-inf")
    results = {}

    print("Training models...")
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        acc = accuracy_score(y_val, y_pred)
        # zero_division=0 returns the same value as the default 'warn' mode
        # but without a warning when a model predicts no positive samples.
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred, zero_division=0)

        results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec}
        print(f"{name} - Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")

        # Strict comparison: on a tie the earlier model in `models` is kept.
        if acc > best_score:
            best_score = acc
            best_model = model

    models_dir = os.path.join(base_dir, '../models')
    os.makedirs(models_dir, exist_ok=True)

    print(f"Saving best model: {best_model.__class__.__name__}")
    joblib.dump(best_model, os.path.join(models_dir, 'best_model.pkl'))
    joblib.dump(le_dict, os.path.join(models_dir, 'label_encoders.pkl'))
    joblib.dump(kmeans_model, os.path.join(models_dir, 'kmeans.pkl'))

    return results
| |
|
if __name__ == "__main__":
    # Script entry point: run the training pipeline only when this file is
    # executed directly, not when it is imported. The returned metrics dict
    # is not used here.
    train_and_evaluate()
| |
|