library(tidyverse) #for data organization
library(lubridate) #for date manipulation
library(scales) #for date display
library(ggplot2) #for graphics
library(leaflet) #for maps
For this example, we will be examining vandalism in Boston, Massachusetts, USA and will create a map highlighting incidents of vandalism. Our data were sourced from Kaggle.com.
Data: Crimes in Boston
Source: https://www.kaggle.com/ankkur13/boston-crime-data/downloads/boston-crime-data.zip/3
Our first step will be to import the data and do an initial observation of our variables.
# Load the raw crime records into a tibble.
# "crime.csv" is resolved relative to the current working directory,
# so make sure the working directory is the folder that holds the file.
crimeData <- read.csv("crime.csv") %>%
  as_tibble()
# Peek at the first few rows.
head(crimeData)
# List the available column names.
names(crimeData)
## [1] "INCIDENT_NUMBER" "OFFENSE_CODE" "OFFENSE_CODE_GROUP"
## [4] "OFFENSE_DESCRIPTION" "DISTRICT" "REPORTING_AREA"
## [7] "SHOOTING" "OCCURRED_ON_DATE" "YEAR"
## [10] "MONTH" "DAY_OF_WEEK" "HOUR"
## [13] "UCR_PART" "STREET" "Lat"
## [16] "Long" "Location"
In this step, we reduce our large data set to a smaller, more focused one containing only our variables of interest: the type of offense, when it occurred, and where.
# Keep only the offense type, the timestamp, and the coordinates.
filteredData <- crimeData %>%
  select(OFFENSE_CODE_GROUP, OCCURRED_ON_DATE, Lat, Long)
# Narrow the records down to vandalism reports.
vandalismData <- filteredData %>%
  filter(OFFENSE_CODE_GROUP == "Vandalism")
# Peek at the result.
head(vandalismData)
The variable names are a bit long so let’s shorten them into names that are easier to read and write.
# Give the columns shorter names that are quicker to read and type.
vandalismData <- vandalismData %>%
  rename(
    crime = OFFENSE_CODE_GROUP,
    date = OCCURRED_ON_DATE,
    lat = Lat,
    lon = Long
  )
# Peek at the renamed data.
head(vandalismData)
# Inspect the type of the date column before converting it.
class(vandalismData$date)
## [1] "factor"
# Turn the date column into Date values.
vandalismData <- vandalismData %>%
  mutate(date = as.Date(date))
# Confirm the new type.
class(vandalismData$date)
## [1] "Date"
Let’s drop any rows that contain missing values (NA), if present.
# Remove every row that has a missing value (NA) in any column.
# count the rows before dropping NA (uncomment to compare)
#count(vandalismData, crime)
# drop_na() returns a new tibble instead of modifying its input, so the
# result must be assigned back for the rows to actually be removed.
vandalismData <- vandalismData %>% drop_na()
# count the rows after dropping
count(vandalismData, crime)
Now we can create a plot from our data. Through some trial plotting, it was discovered that vandalism reports varied with average temperature. Additional monthly temperature data could make for an interesting addition to our plot!
# Add month and year columns derived from the report date. The month is the
# bar-chart axis below; the year is kept so the data can be filtered by year.
vandalismData <- vandalismData %>%
  mutate(month = month(date), year = year(date))
# Average monthly temperature in degrees Fahrenheit, January to December.
tempF <- c(36, 39, 45, 56, 66, 76, 81, 80, 72, 61, 51, 41)
# Convert to degrees Celsius.
tempC <- (tempF - 32) * (5 / 9)
# Month numbers 1-12, matching the values produced by month(date).
month <- seq(1, 12, by = 1)
# One row per month with its average temperature; feeds the text labels.
df <- data.frame(month, tempC)
# Horizontal bar chart of report counts per month, with the rounded average
# temperature printed on each bar.
plot <- ggplot(data = vandalismData, aes(x = month)) +
  geom_bar(fill = "#2D708EFF") +
  coord_flip() +
  # month is numeric, so use a continuous scale with one labelled break per
  # month. No limits are set here: the bars extend half a unit beyond 1 and
  # 12, and hard limits of c(1, 12) would cut the first and last bars off.
  scale_x_continuous(breaks = month, labels = month.name) +
  scale_y_continuous(breaks = seq(0, 2000, by = 200)) +
  labs(
    title = "Boston vandalism reports, aggregated by month, 2015-2018 \n with average monthly temperature (°C)",
    x = "Month",
    y = "Reports",
    caption = "Data Sources: Kaggle.com, USClimateData.com"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.caption = element_text(color = "#777777FF", face = "italic")
  ) +
  # Each label is positioned on the count axis at tempC * 30, so labels for
  # warmer months sit further along their bar. hjust and vjust are constants,
  # so they are set outside aes().
  geom_text(
    data = df,
    aes(x = month, y = tempC * 30, label = round(tempC, digits = 0)),
    hjust = 0, vjust = 0.5,
    col = "#FFFFFFFF", fontface = "bold"
  )
# draw the plot
plot
Next, let’s take a look at where vandalism is occurring in Boston.
# Exploratory plots showed some records located far from Boston (southwest
# of Africa), so keep only latitudes close to the city. filter() also drops
# rows whose latitude is NA.
# Note from the original run: takes us from 15810 to 15267 rows (-543)
vandalismData <- vandalismData %>% filter(between(lat, 41, 43))
# FILTERING to increase performance
# Roughly 15,000 points on one map is likely excessive, so the data are
# limited to a single year and then randomly subsampled.
# choose a specific year
selectedYear <- 2018
# Keep only the selected year. filter() discards rows whose year is NA,
# whereas logical indexing with `[` does not reliably remove them.
filterSelectedYear <- vandalismData %>% filter(year == selectedYear)
# Maximum number of data points to show on the map.
# WARNING: increasing this number to high values may significantly increase
# rendering time.
n <- 1000
# Randomly sample up to 'n' rows. The sample size is capped at the number of
# available rows because sampling without replacement fails when more rows
# are requested than exist.
rowsAvailable <- nrow(filterSelectedYear)
subSelectedYear <- filterSelectedYear[
  sample.int(rowsAvailable, size = min(n, rowsAvailable)),
]
# Leaflet map on a light basemap with simple, translucent circle markers,
# centred on Boston.
m <- leaflet(subSelectedYear, width = "100%") %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~lon, lat = ~lat,
    radius = 5,
    color = "#2D708EFF",
    stroke = FALSE, fillOpacity = 0.5
  ) %>%
  setView(lng = -71.093258, lat = 42.363196, zoom = 12)
# draw the map
m