| | import pandas as pd |
| | import numpy as np |
| | from sklearn.cluster import KMeans |
| | from pandas.tseries.holiday import USFederalHolidayCalendar as calendar |
| |
|
def define_target(df):
    """
    Creates the target variable 'IsViolent' based on crime category.

    A row is labelled 1 when its 'Category' value exactly matches one of the
    violent categories listed below (case-sensitive), and 0 otherwise,
    including when the category is missing.

    Args:
        df: DataFrame with a 'Category' column. It is modified in place.

    Returns:
        The same DataFrame with an integer 'IsViolent' column added.
    """
    violent_categories = [
        'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON'
    ]

    # Vectorized membership test: avoids a Python-level call per row and
    # always yields an integer column, even when the frame is empty.
    df['IsViolent'] = df['Category'].isin(violent_categories).astype(int)
    return df
| |
|
def extract_temporal_features(df):
    """
    Extracts temporal features from the 'Dates' column.

    Adds the columns 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek',
    'IsWeekend' and 'IsHoliday' to ``df`` in place.

    Args:
        df: DataFrame with a datetime-typed 'Dates' column (the ``.dt``
            accessor is used on it).

    Returns:
        The same DataFrame with the feature columns added. 'IsWeekend' and
        'IsHoliday' are 0/1 integers; 'IsHoliday' marks US federal holidays.
    """
    dates = df['Dates']

    df['Hour'] = dates.dt.hour
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month
    df['Year'] = dates.dt.year
    df['DayOfWeek'] = dates.dt.dayofweek

    # pandas numbers weekdays from Monday=0, so 5 and 6 are Saturday/Sunday.
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    first_seen, last_seen = dates.min(), dates.max()
    if pd.isna(first_seen):
        # Empty frame or only missing dates: there is no range to look up
        # holidays for, so nothing can be a holiday.
        df['IsHoliday'] = 0
        return df

    # The calendar returns holidays as midnight timestamps and drops any that
    # fall before `start`. Truncate the lower bound to midnight so a holiday
    # on the earliest day in the data is not filtered out.
    holidays = calendar().holidays(start=first_seen.normalize(), end=last_seen)

    # Compare on the calendar day only (time of day zeroed out).
    df['IsHoliday'] = dates.dt.normalize().isin(holidays).astype(int)

    return df
| |
|
def get_season(month):
    """
    Maps a month number to a season name.

    Months 12, 1, 2 give 'Winter'; 3-5 give 'Spring'; 6-8 give 'Summer'.
    Any other value gives 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, member_months in season_months:
        if month in member_months:
            return season_name
    # Everything not matched above falls through to 'Fall'.
    return 'Fall'
| |
|
def extract_contextual_features(df):
    """
    Adds the contextual 'Season' column to ``df``.

    Each value of the existing 'Month' column is passed through
    ``get_season`` and the result is stored in a new 'Season' column.
    The DataFrame is modified in place and returned.
    """
    season_labels = df['Month'].apply(get_season)
    df['Season'] = season_labels
    return df
| |
|
def extract_location_features(df, n_clusters=10, kmeans_model=None):
    """
    Adds a 'LocationCluster' column based on K-Means over the 'X'/'Y' columns.

    When ``kmeans_model`` is supplied it is only used to predict cluster
    labels. Otherwise a new KMeans with ``n_clusters`` clusters is fit on
    ``df`` and used to label it.

    Returns:
        A ``(df, model)`` tuple: the same DataFrame with 'LocationCluster'
        added, and the model that produced the labels.
    """
    coordinates = df[['X', 'Y']]

    if kmeans_model is not None:
        # Reuse the caller's model; no fitting happens on this data.
        df['LocationCluster'] = kmeans_model.predict(coordinates)
        return df, kmeans_model

    # No model supplied: fit a fresh one on this frame's coordinates.
    fitted_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['LocationCluster'] = fitted_model.fit_predict(coordinates)
    return df, fitted_model
| |
|
def preprocess_pipeline(df, is_train=True, kmeans_model=None, n_clusters=10):
    """
    Runs the full preprocessing pipeline.

    Steps, in order: temporal features, contextual features, location
    clusters, and (only when ``is_train`` is true) the 'IsViolent' target.

    Args:
        df: Input DataFrame; the feature steps add columns to it.
        is_train: When true, the target column is created from 'Category'.
        kmeans_model: Previously fitted clustering model to reuse. When it is
            None a new model is fit on ``df`` regardless of ``is_train``, so
            pass the model returned by the training call when processing
            non-training data.
        n_clusters: Number of clusters used when a new model has to be fit;
            ignored when ``kmeans_model`` is given.

    Returns:
        A ``(df, kmeans_model)`` tuple with the processed DataFrame and the
        clustering model that was used.
    """
    df = extract_temporal_features(df)
    # Season is derived from the 'Month' column created by the previous step.
    df = extract_contextual_features(df)

    df, kmeans_model = extract_location_features(
        df, n_clusters=n_clusters, kmeans_model=kmeans_model
    )

    if is_train:
        df = define_target(df)

    return df, kmeans_model
| |
|
if __name__ == "__main__":
    # Intentionally a no-op: running this file directly does nothing.
    pass
| |
|