import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from datetime import datetime,date
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings("ignore")
# Load the raw crime records into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='crimedata_csv_all_years.csv')
df.head(n=5)
TYPE YEAR MONTH DAY HOUR MINUTE HUNDRED_BLOCK NEIGHBOURHOOD X Y
0 Break and Enter Commercial 2012 12 14 8 52 NaN Oakridge 491285.000000 5.453433e+06
1 Break and Enter Commercial 2019 3 7 2 6 10XX SITKA SQ Fairview 490612.964805 5.457110e+06
2 Break and Enter Commercial 2019 8 27 4 12 10XX ALBERNI ST West End 491007.779775 5.459174e+06
3 Break and Enter Commercial 2014 8 8 5 13 10XX ALBERNI ST West End 491015.943352 5.459166e+06
4 Break and Enter Commercial 2005 11 14 3 9 10XX ALBERNI ST West End 491021.385727 5.459161e+06

Number of missing values in each column

# Per-column count of missing entries (`isna` is the same check as `isnull`).
df.isna().sum()
TYPE                 0
YEAR                 0
MONTH                0
DAY                  0
HOUR                 0
MINUTE               0
HUNDRED_BLOCK       13
NEIGHBOURHOOD    64574
X                  119
Y                  119
dtype: int64
# Build one timestamp per incident from its separate date/time component columns.
df['Date'] = pd.to_datetime(df[['YEAR','MONTH','DAY','HOUR','MINUTE']])

# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
df['WEEKDAY'] = df['Date'].dt.dayofweek

# Flag incidents that fall on a public holiday in British Columbia.
# NOTE(review): newer releases of `holidays` prefer `subdiv='BC'` over
# `prov='BC'` -- confirm against the installed version before changing it.
ca_holidays = holidays.Canada(prov='BC')

# The holiday test depends only on the calendar date, so evaluate it once per
# distinct date and broadcast the answer with `map`, instead of materialising a
# row Series for every record. This also removes the earlier mix of positional
# (`iloc`) and label (`at`) indexing, which agreed only on a default RangeIndex.
incident_dates = df['Date'].dt.date
holiday_flag_by_date = {
    day: float(day in ca_holidays) for day in incident_dates.unique()
}
# Stored as 0.0 / 1.0 floats, matching the dtype the column had before.
df['HOLIDAY'] = incident_dates.map(holiday_flag_by_date).astype(float)
df.head()
TYPE YEAR MONTH DAY HOUR MINUTE HUNDRED_BLOCK NEIGHBOURHOOD X Y Date WEEKDAY HOLIDAY
0 Break and Enter Commercial 2012 12 14 8 52 NaN Oakridge 491285.000000 5.453433e+06 2012-12-14 08:52:00 4 0.0
1 Break and Enter Commercial 2019 3 7 2 6 10XX SITKA SQ Fairview 490612.964805 5.457110e+06 2019-03-07 02:06:00 3 0.0
2 Break and Enter Commercial 2019 8 27 4 12 10XX ALBERNI ST West End 491007.779775 5.459174e+06 2019-08-27 04:12:00 1 0.0
3 Break and Enter Commercial 2014 8 8 5 13 10XX ALBERNI ST West End 491015.943352 5.459166e+06 2014-08-08 05:13:00 4 0.0
4 Break and Enter Commercial 2005 11 14 3 9 10XX ALBERNI ST West End 491021.385727 5.459161e+06 2005-11-14 03:09:00 0 0.0
# Raw TYPE values that are grouped into the 'SEVERE' category.
Severe_crimes = ['Vehicle Collision or Pedestrian Struck (with Fatality)',
                'Homicide','Offence Against a Person','Vehicle Collision or Pedestrian Struck (with Injury)']
# Raw TYPE values that are grouped into the 'Theft' category.
Theft = ['Theft from Vehicle','Other Theft','Theft of Vehicle','Theft of Bicycle']

# Lookup table from raw TYPE to coarse category. It is filled from lowest to
# highest precedence, so a name listed in several groups resolves as
# SEVERE > Theft > Mischief.
crime_category = {'Mischief': 'Mischief'}
crime_category.update(dict.fromkeys(Theft, 'Theft'))
crime_category.update(dict.fromkeys(Severe_crimes, 'SEVERE'))

# Vectorised replacement for a row-by-row loop: every TYPE that is not in the
# lookup (including missing values, which stringify to 'nan') becomes 'B&E'.
df['CRIME_TYPE'] = df['TYPE'].astype(str).map(crime_category).fillna('B&E')
df.head(5)
# How many incidents fall on a holiday (1.0) versus a regular day (0.0).
df['HOLIDAY'].value_counts()
0.0    600443
1.0     19131
Name: HOLIDAY, dtype: int64
%matplotlib inline
sns.countplot(x='YEAR',data=df[['Y','X','YEAR']][df.X.isnull()])
<matplotlib.axes._subplots.AxesSubplot at 0x1e7f0fc8f48>

This shows that the largest share of the missing coordinate values (almost half) comes from the year 2003.

Let's drop the rows with missing coordinates.

# Remove, in place, every row that lacks an X or a Y coordinate, then preview.
df.dropna(axis='index', subset=['X', 'Y'], inplace=True)
df.head(n=5)
TYPE YEAR MONTH DAY HOUR MINUTE HUNDRED_BLOCK NEIGHBOURHOOD X Y Date WEEKDAY HOLIDAY CRIME_TYPE
0 Break and Enter Commercial 2012 12 14 8 52 NaN Oakridge 491285.000000 5.453433e+06 2012-12-14 08:52:00 4 0.0 B&E
1 Break and Enter Commercial 2019 3 7 2 6 10XX SITKA SQ Fairview 490612.964805 5.457110e+06 2019-03-07 02:06:00 3 0.0 B&E
2 Break and Enter Commercial 2019 8 27 4 12 10XX ALBERNI ST West End 491007.779775 5.459174e+06 2019-08-27 04:12:00 1 0.0 B&E
3 Break and Enter Commercial 2014 8 8 5 13 10XX ALBERNI ST West End 491015.943352 5.459166e+06 2014-08-08 05:13:00 4 0.0 B&E
4 Break and Enter Commercial 2005 11 14 3 9 10XX ALBERNI ST West End 491021.385727 5.459161e+06 2005-11-14 03:09:00 0 0.0 B&E

Now let us look at the different neighbourhoods.

# Incidents per neighbourhood, together with the number of distinct neighbourhoods.
neighbourhood_counts = df['NEIGHBOURHOOD'].value_counts()
neighbourhood_counts, len(neighbourhood_counts)
(Central Business District    136148
 West End                      48389
 Fairview                      36501
 Mount Pleasant                36104
 Grandview-Woodland            31413
 Renfrew-Collingwood           30892
 Kitsilano                     30442
 Kensington-Cedar Cottage      28232
 Strathcona                    25567
 Hastings-Sunrise              21124
 Sunset                        19600
 Marpole                       15054
 Riley Park                    14550
 Victoria-Fraserview           12264
 Killarney                     11798
 Oakridge                       9221
 Dunbar-Southlands              8752
 Kerrisdale                     8428
 Arbutus Ridge                  6790
 West Point Grey                6719
 Shaughnessy                    6289
 South Cambie                   5990
 Stanley Park                   4163
 Musqueam                        570
 Name: NEIGHBOURHOOD, dtype: int64, 24)

There are 24 distinct neighbourhood values.

Let's look at the relation between the X, Y coordinates and the neighbourhood.

# Scatter every incident by its coordinates, coloured by neighbourhood
# (legend hidden), with the x axis on a logarithmic scale.
plt.figure(figsize=(20, 20))
g = sns.scatterplot(data=df, x="X", y="Y", hue="NEIGHBOURHOOD", legend=False)
g.set(xscale="log");

There seems to be a very strong correlation between the coordinate values and the neighbourhood, which makes sense because the coordinates in effect represent the location of the neighbourhood. Therefore, we can use KNN to impute the missing neighbourhood values in our data.

Checking whether any coordinates are missing in the rows where the neighbourhood is missing

# Among rows that have no neighbourhood, count the missing X and Y values.
cols = ['X', 'Y']
neighbourhood_missing = df['NEIGHBOURHOOD'].isnull()
df.loc[neighbourhood_missing, cols].isnull().sum()
X    0
Y    0
dtype: int64

Therefore, no coordinate values are missing in the rows where the neighbourhood is missing.

# Distinct neighbourhood labels present in the data (NaN included).
df['NEIGHBOURHOOD'].unique()
array(['Oakridge', 'Fairview', 'West End', 'Central Business District',
       'Hastings-Sunrise', 'Strathcona', 'Grandview-Woodland',
       'Kitsilano', 'Kensington-Cedar Cottage', 'Sunset',
       'Mount Pleasant', 'Stanley Park', 'Shaughnessy', 'Marpole',
       'West Point Grey', 'Victoria-Fraserview', 'Kerrisdale',
       'Riley Park', 'Arbutus Ridge', 'Renfrew-Collingwood', 'Killarney',
       'Dunbar-Southlands', 'South Cambie', 'Musqueam', nan], dtype=object)
# Rows without a street block are discarded before imputing neighbourhoods.
df.dropna(subset=['HUNDRED_BLOCK'],inplace=True)

# Rows with a known neighbourhood train the imputer; the rest get a prediction.
has_neighbourhood = df['NEIGHBOURHOOD'].notnull()
train_df = df[has_neighbourhood]
x_train = train_df[['X','Y']]
y_train = train_df['NEIGHBOURHOOD']
# Explicit copy: the predicted labels are written into this frame below, and
# writing into a filtered view of `df` is a chained assignment.
test_df = df[~has_neighbourhood].copy()

# The haversine metric expects [latitude, longitude] pairs in radians. X/Y here
# hold values in the 10^5-10^6 range (see the sample rows), so they cannot be
# radians -- presumably projected planar coordinates; confirm with the data
# dictionary. Straight-line (Euclidean) distance is the appropriate metric.
classifier = KNeighborsClassifier(n_neighbors=5,metric='euclidean')

classifier.fit(x_train, y_train)
if test_df.empty:
    # Nothing to impute; `predict` would raise on a zero-row input.
    y_pred = np.array([], dtype=object)
else:
    y_pred = classifier.predict(test_df[['X','Y']])
    test_df['NEIGHBOURHOOD'] = y_pred

# Recombine the originally-labelled and the imputed rows into one frame.
new_df = pd.concat([train_df, test_df], ignore_index=True)

# Number of columns that still contain any missing value.
print(new_df.isnull().any().sum())
# Rebuild the timestamp and use it as the index; the raw TYPE column is
# dropped, leaving CRIME_TYPE as the coarse category.
new_df['Date'] = pd.to_datetime(new_df[['YEAR','MONTH','DAY','HOUR','MINUTE']])
new_df.set_index('Date',inplace=True)
new_df.drop('TYPE',axis=1,inplace=True)
print(new_df.head())
0
                     YEAR  MONTH  DAY  HOUR  MINUTE    HUNDRED_BLOCK  \
Date                                                                   
2019-03-07 02:06:00  2019      3    7     2       6    10XX SITKA SQ   
2019-08-27 04:12:00  2019      8   27     4      12  10XX ALBERNI ST   
2014-08-08 05:13:00  2014      8    8     5      13  10XX ALBERNI ST   
2005-11-14 03:09:00  2005     11   14     3       9  10XX ALBERNI ST   
2006-05-21 04:50:00  2006      5   21     4      50  10XX ALBERNI ST   

                    NEIGHBOURHOOD              X             Y  WEEKDAY  \
Date                                                                      
2019-03-07 02:06:00      Fairview  490612.964805  5.457110e+06        3   
2019-08-27 04:12:00      West End  491007.779775  5.459174e+06        1   
2014-08-08 05:13:00      West End  491015.943352  5.459166e+06        4   
2005-11-14 03:09:00      West End  491021.385727  5.459161e+06        0   
2006-05-21 04:50:00      West End  491021.385727  5.459161e+06        6   

                     HOLIDAY CRIME_TYPE  
Date                                     
2019-03-07 02:06:00      0.0        B&E  
2019-08-27 04:12:00      0.0        B&E  
2014-08-08 05:13:00      0.0        B&E  
2005-11-14 03:09:00      0.0        B&E  
2006-05-21 04:50:00      0.0        B&E  
# Per-column count of missing entries after imputation.
new_df.isna().sum()
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    0
X                0
Y                0
WEEKDAY          0
HOLIDAY          0
CRIME_TYPE       0
dtype: int64
# Order the records chronologically ('Date' is the index level set earlier).
new_df.sort_values(['Date'],axis=0,ascending=True,inplace=True)

# List of years where Crime_Type is to be predicted
year_predict = [2015,2016,2017,2018,2019]

# Feature columns shared by the training and the testing split.
feature_cols = ['YEAR', 'MONTH', 'DAY', 'HOUR','WEEKDAY', 'NEIGHBOURHOOD','HOLIDAY']

# Initialize the Random Forest Classifier Model.
# A fixed seed makes the reported accuracies reproducible between runs.
rfc = RandomForestClassifier(random_state=42)

# Encode the categorical NEIGHBOURHOOD feature as integer codes.
label_encoder = LabelEncoder()
new_df['NEIGHBOURHOOD'] = label_encoder.fit_transform(new_df['NEIGHBOURHOOD'])

# Walk-forward evaluation: for each target year, refit on every earlier year
# (2003 onwards) and score on the target year. `fit` discards the previous
# fit, so each iteration yields an independent model.
for i in year_predict:

    # Training split: all records strictly before the target year.
    mask = (new_df['YEAR'] >= 2003) & (new_df['YEAR'] < i)
    train_rows = new_df.loc[mask]
    train_y = train_rows.CRIME_TYPE
    train_x = train_rows[feature_cols]

    # Testing split: the records of the target year only.
    new_mask = (new_df['YEAR'] == i)
    test_rows = new_df.loc[new_mask]
    test_y = test_rows.CRIME_TYPE
    test_x = test_rows[feature_cols]

    print("Year: ",i)
    # Fit and predict on the same input type (DataFrames) so the feature names
    # seen at fit time match those seen at predict time.
    rfc.fit(train_x, train_y)
    y_pred = rfc.predict(test_x)
    print("Accuracy: ",accuracy_score(test_y,y_pred))
Year:  2015
Accuracy:  0.5991150957675963
Year:  2016
Accuracy:  0.6260435379900666
Year:  2017
Accuracy:  0.6284370184118745
Year:  2018
Accuracy:  0.6302092907224075
Year:  2019
Accuracy:  0.6614870243591785
model = XGBClassifier()

# Feature columns shared by the training and the testing split.
feature_cols = ['YEAR', 'MONTH', 'DAY', 'HOUR','WEEKDAY', 'NEIGHBOURHOOD','HOLIDAY']

# Walk-forward evaluation: for each target year, refit on every earlier year
# (2003 onwards) and score on the target year.
for i in year_predict:

    # Training split: all records strictly before the target year.
    mask = (new_df['YEAR'] >= 2003) & (new_df['YEAR'] < i)
    train_rows = new_df.loc[mask]
    train_y = train_rows.CRIME_TYPE
    train_x = train_rows[feature_cols]

    # Testing split: the records of the target year only.
    new_mask = (new_df['YEAR'] == i)
    test_rows = new_df.loc[new_mask]
    test_y = test_rows.CRIME_TYPE
    test_x = test_rows[feature_cols]

    print("Year: ",i)
    # Recent xgboost releases reject string class labels and require integer
    # codes 0..k-1. Encode the target on the training window only, so the codes
    # are contiguous for exactly the classes the model sees.
    target_encoder = LabelEncoder()
    model.fit(train_x, target_encoder.fit_transform(train_y))

    # Decode predictions back to the original labels before scoring against
    # the string-valued ground truth.
    y_pred_xgb = target_encoder.inverse_transform(model.predict(test_x))
    print("Accuracy: ",accuracy_score(test_y,y_pred_xgb))
    
Year:  2015
Accuracy:  0.6681318041567212
Year:  2016
Accuracy:  0.6848251083166015
Year:  2017
Accuracy:  0.6859166734257982
Year:  2018
Accuracy:  0.6932853654053203
Year:  2019
Accuracy:  0.7254258875975164