library(tidyverse) #for data organization
library(ggplot2)   #for graphics
library(ggrepel)   #for preventing overlap of point labels
library(RCurl)     #remote access Google Drive .csv files

Session 01: A Pokemon Analysis

Introduction

For this example, we will be examining Pokemon statistics. You are a data scientist headed to a tournament, and you need to choose a team for a battle against an unknown opponent. How can you use data to help with your decision?

Data: Pokemon

Source: https://www.kaggle.com/abcsds/pokemon/downloads/pokemon.zip/2

Data Import

Our first step will be to import the data and do an initial observation of our variables. We would like to create a balanced four-member Pokemon team with high attack strength, defense strength, and hit points so that we can do our best in the tournament. (note: a second grader told me that “In real life, you can take as much (Pokemon) as you have.” We will create a four-member team.)

Quick Question: What kinds of data might we expect to collect on athletes or competitors of any kind?

# Read in the data set.
# Local alternative (run R from the folder that contains the file):
# pokemonData <- as_tibble(read.csv("pokemon.csv"))

# Download the published Google Sheets .csv as text.
url <- getURL("https://docs.google.com/spreadsheets/d/e/2PACX-1vSU2srufyhR96zu6iMFmJ-vf3h-3BpPpECK51WYhdTtdvHWNu68t1xNQIN8J3748WhxrSPeu07nwUsS/pub?output=csv")

# Wrap the downloaded text in a connection so read.csv() can parse it.
tc <- textConnection(url)

# Import the data as a tibble.
pokemonData <- as_tibble(read.csv(tc))

# textConnection() hands back an already-open connection, which read.csv()
# leaves open; close it explicitly so the connection is not leaked.
close(tc)

# Preview the data.
head(pokemonData)
# Get the column names.
colnames(pokemonData)
##  [1] "X."         "Name"       "Type.1"     "Type.2"     "Total"     
##  [6] "HP"         "Attack"     "Defense"    "Sp..Atk"    "Sp..Def"   
## [11] "Speed"      "Generation" "Legendary"

Abbreviation

The variable names are a bit long so let’s shorten them into something that’s easy to read/write.

# Give the columns we will work with short, lower-case names.
# rename() takes new_name = old_name pairs; all other columns are untouched.
filteredData <- pokemonData %>%
  rename(
    name = Name,
    type = Type.1,
    hp = HP,
    attack = Attack,
    defense = Defense,
    speed = Speed,
    legendary = Legendary
  )

Quick Question: What variables would we use to determine the composition of our team?

Filtering

In this step, we filter our large data set into a smaller, more focused data set including only our variables of interest.

# Keep only the variables of interest; every other column is dropped.
filteredData <- filteredData %>%
  select(name, type, hp, attack, defense, speed, legendary)

# Preview the first rows of the reduced data set.
filteredData %>% head()

Dealing with NA

Let’s drop any rows that contain missing values (NA), if there are any.

# Drop every row that has an NA in any column. drop_na() returns a new
# tibble, so the result is assigned back; without the assignment the
# cleaned rows would only be printed and filteredData left unchanged.
filteredData <- filteredData %>% drop_na()

# Count the rows remaining after dropping.
nrow(filteredData)

Quick Question: How can we visualize our data to help make a decision?

Plotting

Now we can create a plot from our data. Let’s see how all of the Pokemon compare.

# Overview scatter plot: defense (x) against attack (y) for every Pokemon,
# coloured by type. Large, mostly transparent points let dense regions show
# up as darker areas.
plot <- ggplot(filteredData, aes(x = defense, y = attack)) +
  geom_point(aes(colour = factor(type)), alpha = 0.2, size = 8) +
  # Tick marks every 20 units on both axes.
  scale_x_continuous(breaks = seq(0, 200, by = 20)) +
  scale_y_continuous(breaks = seq(0, 200, by = 20)) +
  labs(
    title = "Pokemon Comparison",
    x = "Defense",
    y = "Attack",
    caption = "Data Sources: Kaggle.com"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    # Centred, bold title.
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.caption = element_text(color = "#777777FF", face = "italic"),
    # Horizontal legend underneath the plot.
    legend.direction = "horizontal",
    legend.position = "bottom"
  ) +
  # NOTE(review): only colour is mapped in this plot, so this fill guide
  # probably has no visible effect -- confirm whether colour was intended.
  guides(fill = guide_legend(ncol = 2))

# Draw the plot.
plot

High-level plots

This plot gives a good high-level view of our data, but it is too crowded to read any useful detail from. Let’s look at a separate, focused plot for each type to see which types might have good candidates.

# Small multiples: one defense-vs-attack panel per Pokemon type, with point
# size and colour both showing hit points.
# Built with ggplot() rather than qplot(): qplot() is deprecated, and it
# treated alpha = 0.1 as a mapped aesthetic (rescaled by the alpha scale)
# instead of a fixed transparency. Setting alpha on the layer applies 0.1.
p1 <- ggplot(
  filteredData,
  aes(x = defense, y = attack, size = hp, colour = hp)
) +
  geom_point(alpha = 0.1)

# Facet by type on a 4-row grid and hide the legends to save space.
p1 +
  facet_wrap(~ type, nrow = 4) +
  theme(legend.position = "none") +
  ggtitle("Low-detail, high-level plot to examine trends within and between Pokemon types")

Quick Question: Which groups contain possible candidates for our team? Which types should we have?

Plots with Focus

Let’s now filter to a single type and scale our points by the hit points (life strength) of our Pokemon.

# Keep only the Electric-type Pokemon.
egfData <- filter(filteredData, type == "Electric")

# Bubble plot of defense (x) against attack (y); bubble size follows hit
# points. Shape 21 is a filled circle, so the fill carries the type colour
# and the outline is drawn in black.
plot <- ggplot(egfData, aes(x = defense, y = attack, size = hp)) +
  geom_point(
    aes(fill = factor(type)),
    alpha = 0.5,
    shape = 21,
    colour = "#000000"
  ) +
  # Widen the size range so differences in hit points are easy to see.
  scale_size(range = c(2, 40)) +
  # Yellow for the single (Electric) type.
  scale_fill_manual(values = c("#FFFF00")) +
  # Name labels, nudged apart so they do not overlap; the fixed size = 5
  # overrides the hp-based size inherited from the plot.
  geom_text_repel(aes(label = name), hjust = 0, vjust = 0, size = 5) +
  # Tick marks every 20 units on both axes.
  scale_x_continuous(breaks = seq(0, 200, by = 20)) +
  scale_y_continuous(breaks = seq(0, 200, by = 20)) +
  labs(
    title = "Pokemon Comparison",
    x = "Defense",
    y = "Attack",
    caption = "Data Sources: Kaggle.com"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.caption = element_text(color = "#777777FF", face = "italic"),
    legend.direction = "horizontal",
    legend.position = "bottom"
  ) +
  # Lay the type legend out in two columns.
  guides(fill = guide_legend(ncol = 2))

# Draw the plot.
plot

From this plot, it seems Ampharos and Mega Ampharos have a good balance of high defense strength, high attack strength, and a high number of hit points. We could add Mega Ampharos to the team to represent Electric types.

More Plots

Let’s take a look at Steel, Rock, and Dragon types to choose our other team members.

# Keep only the Steel, Rock, and Dragon types.
egfData <- filteredData %>% filter(type %in% c("Steel", "Rock", "Dragon"))

# Scatter plot of defense (x) against attack (y), filled by type.
# Shape 21 is a filled circle: fill carries the type colour, col the outline.
plot <- ggplot(data = egfData, aes(x = defense)) +
  geom_point(
    aes(y = attack, fill = factor(type)),
    alpha = 1 / 2, size = 8, pch = 21, col = "#000000"
  ) +
  # Name each colour explicitly. Unnamed values are matched to the factor
  # levels in alphabetical order (Dragon, Rock, Steel), which would give
  # Dragon the grey and Steel the red instead of the intended pairing.
  scale_fill_manual(
    values = c(Steel = "#CCCCCC", Rock = "#664444", Dragon = "#FF3333")
  ) +
  # Name labels, nudged apart so they do not overlap.
  geom_text_repel(
    aes(label = name, x = defense, y = attack),
    hjust = 0, vjust = 0, size = 5
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  ) +
  # Tick marks every 20 units on both axes.
  scale_x_continuous(breaks = seq(0, 200, by = 20)) +
  scale_y_continuous(breaks = seq(0, 200, by = 20)) +
  ggtitle("Pokemon Comparison") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold")) +
  xlab("Defense") +
  ylab("Attack") +
  labs(caption = "Data Sources: Kaggle.com") +
  theme(plot.caption = element_text(color = "#777777FF", face = "italic")) +
  theme(legend.direction = "horizontal", legend.position = "bottom") +
  # Lay the type legend out in two columns.
  guides(fill = guide_legend(ncol = 2))

# Draw the plot.
plot

Quick Question: Are these colors a good choice? What could we change?

More Plots (continued)

Let’s take another look at Steel, Rock, and Dragon types to choose our other team members. We used a palette selection from https://venngage.com/blog/color-blind-friendly-palette/. We also scaled each data point by the number of HP so that bigger area represents more HP.

# Keep only the Steel, Rock, and Dragon types.
egfData <- filter(filteredData, type %in% c("Steel", "Rock", "Dragon"))

# Bubble plot of defense (x) against attack (y); bubble size follows hit
# points and fill shows the type. Shape 21 is a filled circle with a
# separately coloured (black) outline.
plot <- ggplot(egfData, aes(x = defense, y = attack, size = hp)) +
  geom_point(
    aes(fill = factor(type)),
    alpha = 0.5,
    shape = 21,
    colour = "#000000"
  ) +
  # Colour-blind-friendly palette. The values are unnamed, so they are
  # assigned to the types in level order (alphabetical: Dragon, Rock, Steel).
  scale_fill_manual(values = c("#F5793A", "#A95AA1", "#0F2080")) +
  # Widen the size range so differences in hit points are easy to see.
  scale_size(range = c(2, 40)) +
  # Name labels, nudged apart so they do not overlap; the fixed size = 5
  # overrides the hp-based size inherited from the plot.
  geom_text_repel(aes(label = name), hjust = 0, vjust = 0, size = 5) +
  # Tick marks every 20 units on both axes.
  scale_x_continuous(breaks = seq(0, 200, by = 20)) +
  scale_y_continuous(breaks = seq(0, 200, by = 20)) +
  labs(
    title = "Pokemon Comparison",
    x = "Defense",
    y = "Attack",
    caption = "Data Sources: Kaggle.com"
  ) +
  theme_minimal() +
  theme(
    legend.text = element_text(size = 18),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.caption = element_text(color = "#777777FF", face = "italic"),
    legend.direction = "horizontal",
    legend.position = "bottom"
  ) +
  # Lay the type legend out in two columns.
  guides(fill = guide_legend(ncol = 2))

# Draw the plot.
plot

Quick Question: Which four Pokemon would YOU take to the tournament?