library(tidyverse) #for data organization
library(lubridate) #for date manipulation
library(scales)    #for date display
library(ggplot2)   #for graphics
library(leaflet)   #for maps

Session 01: A Map of Boston Vandalism, 2018

Introduction

For this example, we will be examining vandalism in Boston, Massachusetts, USA and will create a map highlighting incidents of vandalism. Our data were sourced from Kaggle.com.

Data: Crimes in Boston

Source: https://www.kaggle.com/ankkur13/boston-crime-data/downloads/boston-crime-data.zip/3

Quick Question: What do you want to know about vandalism in Boston?

Data Import

Our first step will be to import the data and take an initial look at our variables.

# Load the raw crime records from disk as a tibble.
# "crime.csv" is resolved relative to the working directory, so make sure the
# working directory is the folder that contains the file.
crimeData <- read.csv("crime.csv") %>%
  as_tibble()

# Peek at the first few rows
crimeData %>% head()
# List the available column names
crimeData %>% colnames()
##  [1] "INCIDENT_NUMBER"     "OFFENSE_CODE"        "OFFENSE_CODE_GROUP" 
##  [4] "OFFENSE_DESCRIPTION" "DISTRICT"            "REPORTING_AREA"     
##  [7] "SHOOTING"            "OCCURRED_ON_DATE"    "YEAR"               
## [10] "MONTH"               "DAY_OF_WEEK"         "HOUR"               
## [13] "UCR_PART"            "STREET"              "Lat"                
## [16] "Long"                "Location"

Filtering

In this step, we filter our large data set into a smaller, more focused data set that includes only our variables of interest: the type of offense, when it occurred, and where.

# Keep only the columns we need: offense type, date, and coordinates
filteredData <- crimeData %>%
  select(OFFENSE_CODE_GROUP, OCCURRED_ON_DATE, Lat, Long)

# Narrow the records down to vandalism reports
vandalismData <- filteredData %>%
  filter(OFFENSE_CODE_GROUP == "Vandalism")

# Peek at the result
vandalismData %>% head()

Abbreviation

The variable names are a bit long so let’s shorten them into names that are easier to read and write.

# Give the columns short, lowercase names that are quicker to read and type
vandalismData <- vandalismData %>%
  rename(
    crime = OFFENSE_CODE_GROUP,
    date = OCCURRED_ON_DATE,
    lat = Lat,
    lon = Long
  )

# Peek at the renamed data and check how the date column is stored
vandalismData %>% head()
vandalismData$date %>% class()
## [1] "factor"
# Turn the date column into proper Date values
vandalismData <- vandalismData %>%
  mutate(date = as.Date(date))
vandalismData$date %>% class()
## [1] "Date"

Dealing with NA

Let’s drop any rows that contain missing values (NA), if present.

# Remove incomplete records: any row with an NA in one of its columns.

#count the rows before dropping NA
#count(vandalismData, crime)

# drop_na() returns a new tibble and leaves its input untouched, so the result
# must be assigned back. Without the assignment the cleaned data are only
# printed and the NA rows stay in vandalismData.
vandalismData <- vandalismData %>% drop_na()

# Count the rows that remain after dropping
count(vandalismData, crime)

Plotting

Now we can create a plot from our data. Some exploratory plotting suggested that the number of vandalism reports varies with average temperature, so adding average monthly temperature data should make for an interesting addition to our plot!

Quick Question: What do you anticipate the trend to show? What are some ways to represent that information?

# Add month and year columns extracted from the date, so reports can be
# aggregated by month here and filtered by year later on.
vandalismData <- vandalismData %>%
  mutate(month = month(date), year = year(date))

# Average monthly temperature in degrees Fahrenheit, January through December
tempF <- c(36, 39, 45, 56, 66, 76, 81, 80, 72, 61, 51, 41)

# Convert to degrees Celsius
tempC <- (tempF - 32) * (5 / 9)

# Month numbers 1-12, lined up with the temperature vector
month <- seq(1, 12, by = 1)

# One row per month: month number and its average temperature
df <- data.frame(month, tempC)

# Horizontal bar chart of report counts per month, with each month's average
# temperature written on its bar.
plot <- ggplot(data = vandalismData, aes(x = month)) +
  geom_bar(fill = "#2D708EFF") +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.caption = element_text(color = "#777777FF", face = "italic")
  ) +
  # month is numeric, so it needs a continuous scale; a discrete scale with
  # numeric limits triggers a warning in newer ggplot2 versions.
  scale_x_continuous(breaks = month, labels = month.name) +
  scale_y_continuous(breaks = seq(0, 2000, by = 200)) +
  ggtitle("Boston vandalism reports, aggregated by month, 2015-2018 \n with average monthly temperature (°C)") +
  xlab("Month") +
  ylab("Reports") +
  labs(caption = "Data Sources: Kaggle.com, USClimateData.com") +
  # The label position is tempC * 30 on the report-count axis. The factor 30 is
  # purely visual scaling, so warmer months have their label further along the bar.
  geom_text(
    data = df,
    aes(x = month, y = tempC * 30, label = round(tempC, digits = 0)),
    hjust = 0, vjust = 0.5,
    col = "#FFFFFFFF", fontface = "bold"
  )

# Draw the plot
plot

Maps

Next, let’s take a look at where vandalism is occurring in Boston.

Quick Question: What qualities does a “good” map have?

# Exploratory plots showed some records plotted southwest of Africa, far from
# Boston, so keep only rows whose latitude is close to the city.
# Row counts noted when this was written: 15810 -> 15267 (-543)
vandalismData <- vandalismData %>%
  filter(between(lat, 41, 43))

# FILTERING to increase performance
# We might want ~15,000 data points on a map, but it might be excessive.
# We now introduce some limits to reduce the number of points to visualize.

# Choose a specific year
selectedYear <- 2018

# Keep only the selected year. filter() drops rows whose year is NA, whereas
# base `[` logical indexing would return all-NA rows for them.
filterSelectedYear <- vandalismData %>%
  filter(year == selectedYear)

# Determine how many data points to show on the map
#<!>-----------------------------------------------------------------------------<!>
# Increasing this number to high values may significantly increase rendering time
#<!>-----------------------------------------------------------------------------<!>
n <- 1000

# Randomly sample up to 'n' rows. The size is capped at the number of available
# rows because sample() raises an error when asked for more items than exist
# (sampling without replacement).
subSelectedYear <- filterSelectedYear[
  sample(seq_len(nrow(filterSelectedYear)), min(n, nrow(filterSelectedYear))),
]

# Set up the Leaflet map: a clean base map with simple, translucent points,
# centred on Boston.
m <- leaflet(subSelectedYear, width = "100%") %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~lon, lat = ~lat,
    radius = 5,
    color = "#2D708EFF",
    stroke = FALSE, fillOpacity = 0.5
  ) %>%
  setView(lng = -71.093258, lat = 42.363196, zoom = 12)

# Draw the map
m

Quick Question: Let’s make a list of conclusions we can draw from this map (good and bad).

Quick Question: Which of our conclusions are actually supported and valid?