Predicting Crimes in Neighbourhood
In this notebook we try to classify crimes into broad types based on features like the time, day, and neighbourhood. We will try three models and compare them with each other.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from datetime import datetime,date
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings("ignore")
df = pd.read_csv('crimedata_csv_all_years.csv')
df.head()
df.isnull().sum()
df['Date'] = pd.to_datetime(df[['YEAR','MONTH','DAY','HOUR','MINUTE']])
df['WEEKDAY'] = df['Date'].dt.dayofweek
ca_holidays = holidays.Canada(prov='BC')
for i, row in df.iterrows():
    if row['Date'] in ca_holidays:
        df.at[i, 'HOLIDAY'] = 1
    else:
        df.at[i, 'HOLIDAY'] = 0
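Iterating over every row is slow on a dataset this size; a shorter, roughly equivalent sketch (assuming the Date column built above) is:
# Element-wise lookup of each date in the holiday calendar, without an explicit loop
df['HOLIDAY'] = df['Date'].dt.date.map(lambda d: 1 if d in ca_holidays else 0)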
df.head()
Severe_crimes = ['Vehicle Collision or Pedestrian Struck (with Fatality)',
                 'Homicide', 'Offence Against a Person',
                 'Vehicle Collision or Pedestrian Struck (with Injury)']
Theft = ['Theft from Vehicle', 'Other Theft', 'Theft of Vehicle', 'Theft of Bicycle']
for idx, row in df.iterrows():
    if str(row['TYPE']) in Severe_crimes:
        df.at[idx, 'CRIME_TYPE'] = 'SEVERE'
    elif str(row['TYPE']) in Theft:
        df.at[idx, 'CRIME_TYPE'] = 'Theft'
    elif str(row['TYPE']) == 'Mischief':
        df.at[idx, 'CRIME_TYPE'] = 'Mischief'
    else:
        df.at[idx, 'CRIME_TYPE'] = 'B&E'
df.head(5)
df.HOLIDAY.value_counts()
%matplotlib inline
sns.countplot(x='YEAR', data=df[df['X'].isnull()])
This shows that most of the missing coordinate values are from 2003 (almost half of them).
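To put a number on that, we can look at the share of the missing-coordinate rows contributed by each year:
# Fraction of rows with missing coordinates that fall in each year
df.loc[df['X'].isnull(), 'YEAR'].value_counts(normalize=True).head()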
df.dropna(subset=['X','Y'],inplace=True)
df.head()
df.NEIGHBOURHOOD.value_counts(),len(df.NEIGHBOURHOOD.value_counts())
There are 24 distinct NEIGHBOURHOOD values.
plt.figure(figsize=(20,20))
g =sns.scatterplot(x="X", y="Y",
hue="NEIGHBOURHOOD",
data=df,legend=False);
g.set(xscale="log");
There seems to be a very strong relationship between the coordinates and the neighbourhood, which makes sense because the coordinates essentially encode where a neighbourhood is located. Therefore we can use KNN to impute the missing NEIGHBOURHOOD values in our data.
cols = ['X','Y']
df[cols][df['NEIGHBOURHOOD'].isnull()].isnull().sum()
Therefore, no coordinate values are missing for the rows where NEIGHBOURHOOD is missing, so we can use X and Y to impute them.
df.NEIGHBOURHOOD.unique()
df.dropna(subset=['HUNDRED_BLOCK'],inplace=True)
train_df = df.dropna(subset=['NEIGHBOURHOOD'],inplace=False)
x_train = train_df[['X','Y']]
y_train = train_df['NEIGHBOURHOOD']
test_df = df[df['NEIGHBOURHOOD'].isnull()].copy()
# X and Y are planar (projected UTM) coordinates, so the default Euclidean metric is the right
# fit here; 'haversine' expects latitude/longitude in radians
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(test_df[['X','Y']])
test_df['NEIGHBOURHOOD'] = y_pred
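As a rough sanity check on this imputation (not part of the original pipeline), we can cross-validate the same kind of KNN model on the rows where NEIGHBOURHOOD is already known:
from sklearn.model_selection import cross_val_score

# Estimate how well coordinates alone recover the known neighbourhood labels
cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=5), x_train, y_train, cv=3)
print("KNN imputation CV accuracy:", cv_scores.mean())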
new_df = pd.concat([train_df, test_df], ignore_index=True)
print(new_df.isnull().any().sum())
new_df['Date'] = pd.to_datetime(new_df[['YEAR','MONTH','DAY','HOUR','MINUTE']])
new_df.set_index('Date',inplace=True)
new_df.drop('TYPE',axis=1,inplace=True)
print(new_df.head())
new_df.isnull().sum()
new_df.sort_values(['Date'],axis=0,ascending=True,inplace=True)
# List of years where Crime_Type is to be predicted
year_predict = [2015,2016,2017,2018,2019]
# Initialize the Random Forest Classifier Model
rfc = RandomForestClassifier()
# Encode the Categorical Features
label_encoder = LabelEncoder()
new_df['NEIGHBOURHOOD'] = label_encoder.fit_transform(new_df['NEIGHBOURHOOD'])
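Label encoding imposes an arbitrary numeric order on NEIGHBOURHOOD; tree-based models usually tolerate this, but the OneHotEncoder imported earlier is an alternative. A minimal sketch, not wired into the models below:
# One-hot alternative (sketch): one indicator column per neighbourhood, returned as a sparse matrix
ohe = OneHotEncoder(handle_unknown='ignore')
neighbourhood_onehot = ohe.fit_transform(new_df[['NEIGHBOURHOOD']])
print(neighbourhood_onehot.shape)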
# Create a different model for predicting crime for each year
for i in year_predict:
    # Create the training and testing datasets for each iteration
    mask = (new_df['YEAR'] >= 2003) & (new_df['YEAR'] < i)
    train_x = new_df.loc[mask]
    train_y = train_x.CRIME_TYPE
    train_x = train_x[['YEAR', 'MONTH', 'DAY', 'HOUR', 'WEEKDAY', 'NEIGHBOURHOOD', 'HOLIDAY']]
    new_mask = (new_df['YEAR'] == i)
    test_x = new_df.loc[new_mask]
    test_y = test_x.CRIME_TYPE
    test_x = test_x[['YEAR', 'MONTH', 'DAY', 'HOUR', 'WEEKDAY', 'NEIGHBOURHOOD', 'HOLIDAY']]
    print("Year: ", i)
    # Train on all years before i and evaluate on year i
    rfc.fit(train_x, train_y)
    y_pred = rfc.predict(test_x)
    print("Accuracy: ", accuracy_score(test_y, y_pred))
model = XGBClassifier()
# Newer XGBoost versions require numeric class labels, so encode CRIME_TYPE as well
crime_le = LabelEncoder()
# Create a different model for predicting crime for each year
for i in year_predict:
    # Create the training and testing datasets for each iteration
    mask = (new_df['YEAR'] >= 2003) & (new_df['YEAR'] < i)
    train_x = new_df.loc[mask]
    train_y = crime_le.fit_transform(train_x.CRIME_TYPE)
    train_x = train_x[['YEAR', 'MONTH', 'DAY', 'HOUR', 'WEEKDAY', 'NEIGHBOURHOOD', 'HOLIDAY']]
    new_mask = (new_df['YEAR'] == i)
    test_x = new_df.loc[new_mask]
    test_y = crime_le.transform(test_x.CRIME_TYPE)
    test_x = test_x[['YEAR', 'MONTH', 'DAY', 'HOUR', 'WEEKDAY', 'NEIGHBOURHOOD', 'HOLIDAY']]
    print("Year: ", i)
    # Train on all years before i and evaluate on year i
    model.fit(train_x, train_y)
    y_pred_xgb = model.predict(test_x)
    print("Accuracy: ", accuracy_score(test_y, y_pred_xgb))