"""EDA on the Vancouver City crime dataset.

Performs exploratory data analysis (EDA) on the Vancouver City crime dataset
and plots various graphs using seaborn, matplotlib and plotly.
"""
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
# Load the raw incident records and derive two extra columns in one pass:
#   DATE    - calendar date assembled from the YEAR / MONTH / DAY columns
#   WEEKDAY - day-of-week index taken from DATE
df = pd.read_csv("crimedata_csv_all_years.csv").assign(
    DATE=lambda frame: pd.to_datetime(frame[['YEAR', 'MONTH', 'DAY']]),
    WEEKDAY=lambda frame: frame['DATE'].dt.dayofweek,
)

# Quick sanity check of the loaded data: first rows and the distinct neighbourhoods.
print(df.head())
print(df['NEIGHBOURHOOD'].unique())
# Bar chart of how many crimes were recorded in each hour of the day.
count_df = df['HOUR'].value_counts()
plt.figure(figsize=(12, 5))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError otherwise.
sns.barplot(x=count_df.index, y=count_df.values, alpha=1)
plt.title('Crime occurence at different time of the day')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Hour', fontsize=12)
plt.show()
# For every column, print how many entries are present (False) and how many
# are missing (True).
for column in df:
    print(df[column].isna().value_counts())

# Number of recorded incidents per crime type.
print(df['TYPE'].value_counts())
# Bar chart of the number of recorded crimes per neighbourhood.
# The chart gets its own figure and is shown immediately so that later plots
# are not drawn on top of it.
plt.figure(figsize=(12, 6))
nameplot = df['NEIGHBOURHOOD'].value_counts().plot.bar(
    ax=plt.gca(),
    # The plotted series is the per-neighbourhood count, so the title and axis
    # label describe neighbourhoods rather than crime types.
    title='Count of crimes in each neighbourhood of Vancouver',
)
nameplot.set_xlabel('neighbourhood', size=20)
nameplot.set_ylabel('crime count', size=20)
plt.show()
# Bar chart of how many crimes were recorded on each day of the week.
# WEEKDAY comes from pandas' dayofweek accessor (0 = Monday ... 6 = Sunday).
count_df = df['WEEKDAY'].value_counts()
# Start a fresh figure so the bars are not drawn onto axes left over from an
# earlier plot.
plt.figure()
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError otherwise.
sns.barplot(x=count_df.index, y=count_df.values, alpha=1)
plt.title('Crime occurence on different days of the week')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Day of the week', fontsize=12)
plt.show()
# Build a table with one row per year and one column per crime type, holding
# the number of incidents of that type recorded in that year.

# year labels (ascending)
year_labels = sorted(df["YEAR"].unique())
# crime types (alphabetical)
crime_types = sorted(df["TYPE"].unique().tolist())

crime_count_by_year = pd.DataFrame({"year": year_labels})
for current_type in crime_types:
    yearly_counts = df.loc[df["TYPE"] == current_type, "YEAR"].value_counts()
    # Align the counts on the full list of years: a type with no incidents in
    # some year gets 0 for that year instead of producing a shorter sequence
    # that cannot be assigned as a column.
    crime_count_by_year[current_type] = (
        yearly_counts.reindex(year_labels, fill_value=0).to_numpy()
    )

# A bare expression only renders inside a notebook; print so the table is also
# visible when the file is run as a script.
print(crime_count_by_year)
# Interactive line chart: one trace per crime type, with the years on the
# x-axis and that type's column of crime_count_by_year on the y-axis.
crime_types = sorted(df["TYPE"].unique().tolist())
fig = go.Figure(
    data=[
        go.Scatter(
            x=year_labels,
            y=crime_count_by_year[crime_name],
            mode='lines+markers',
            name=crime_name,
        )
        for crime_name in crime_types
    ]
)
fig.update_layout(
    title='Crimes Over the Years in Vancouver by Type',
    xaxis_title='Year',
    # NOTE(review): the traces plot the per-year counts themselves, so this
    # axis title looks inaccurate - confirm the intended label.
    yaxis_title='Absolute Change',
    autosize=True,
    height=570,
    # Horizontal legend.
    legend_orientation="h",
)
fig.show()
# One bar chart per crime type showing how many incidents of that type were
# recorded in each year.
df2 = df['TYPE'].value_counts()
crime_types = df2.index  # value_counts order: most frequent type first
new_df = df.groupby('TYPE')
for value in crime_types:
    df1 = new_df.get_group(value)
    count_df = df1['YEAR'].value_counts()
    plt.figure(figsize=(8.5, 5))
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
    # them positionally and raises a TypeError otherwise.
    sns.barplot(x=count_df.index, y=count_df.values, alpha=1)
    plt.title(value)
    plt.ylabel('Number of Occurrences', fontsize=12)
    plt.xlabel('Year', fontsize=12)
    plt.show()