# Global chunk options: show the code of every chunk, but hide the messages
# and warnings emitted while it runs.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
Homework 3 - Mapping
Data Source
The UFO data set contains over 80,000 reports of UFO sightings from the last century. It includes entries where the location of the sighting was not found or is blank (0.8146%) and entries that have an erroneous or blank time (8.0237%). (Kaggle Link)
# load data set and packages
library(ggplot2)
library(dplyr)
library(ggdendro)
library(viridis)
library(mapproj)
library(tidyverse)
library(usdata)
library(plotly)
# NOTE(review): setwd() ties this document to one machine; prefer a
# project-relative path (e.g. an RStudio project or here::here()) when the
# data file can be moved next to this document.
setwd("~/Desktop/STAA566/Assignment3")
ufo <- read.csv("complete.csv", header = TRUE)
# grab only data points for the US
# (%in% never yields NA, so rows with a missing country are simply dropped
# instead of being turned into all-NA rows)
ufo <- ufo[ufo$country %in% "us", ]
# remove data points without a state & duration
ufo <- ufo[!(is.na(ufo$state) | ufo$state == ""), ]
ufo <- ufo[
  !(is.na(ufo$duration..seconds.) | ufo$duration..seconds. == ""),
]
# convert state to upper case - for matching on US Map data later
ufo$state <- toupper(ufo$state)
Spatial Units Being Displayed
Graph 1 displays the total number of sightings in 2014 by state.
Graph 2 displays the average duration of sightings (in seconds) in 2014 by state.
What I am trying to communicate
Just because a state has many sightings does not mean that its sightings last longer (in seconds). Also, contrary to common belief, states on the coast have more sightings than Midwest states like Nebraska or Oklahoma.
What decisions I made and why I made them to best communicate the data
I manually created a column for Year, since the original data only provides a combined Date & Time field. This lets us display data for the most recent year in the data set.
I also cleaned up any misentered data; judging from the data around those points, these values had somehow been coded as sequential serial numbers.
Code and final figure
First graph - total number of sightings
# get data summarizations: one row per state and year holding the number of
# sightings (Total) and the mean sighting duration (AvgDurSeconds)
# NOTE(review): assumes `ufo` already has a `year` column; it is not created
# in the code shown here - confirm it comes from the CSV.
# `.groups = "drop"` returns an ungrouped table. Without it the result stays
# grouped by state, so a later `filter(year == max(year))` would pick each
# state's own latest year instead of the latest year overall.
ufoSummary <- ufo %>%
  group_by(state, year) %>%
  summarise(
    Total = n(),
    AvgDurSeconds = mean(duration..seconds.),
    .groups = "drop"
  )
# get coordinates for the map
tmp_map <- map_data("state")
# create a state abbreviation column since that's what is in our data set
tmp_map$abbr <- usdata::state2abbr(tmp_map$region)
# attach the summary for the most recent year to every polygon vertex of the
# matching state; ungroup() first so max(year) is the latest year overall,
# not the latest year within each state
us_states_ufo <- tmp_map %>%
  mutate(
    region = str_to_title(region),
    subregion = str_to_title(subregion),
    abbr = str_to_upper(abbr)
  ) %>%
  left_join(
    ufoSummary %>%
      ungroup() %>%
      filter(year == max(year, na.rm = TRUE)),
    by = c("abbr" = "state")
  )
# double check the left join worked as expected
us_states_ufo %>% head(n = 14)
long lat group order region subregion abbr year Total
1 -87.46201 30.38968 1 1 Alabama <NA> AL 2014 47
2 -87.48493 30.37249 1 2 Alabama <NA> AL 2014 47
3 -87.52503 30.37249 1 3 Alabama <NA> AL 2014 47
4 -87.53076 30.33239 1 4 Alabama <NA> AL 2014 47
5 -87.57087 30.32665 1 5 Alabama <NA> AL 2014 47
6 -87.58806 30.32665 1 6 Alabama <NA> AL 2014 47
7 -87.59379 30.30947 1 7 Alabama <NA> AL 2014 47
8 -87.59379 30.28655 1 8 Alabama <NA> AL 2014 47
9 -87.67400 30.27509 1 9 Alabama <NA> AL 2014 47
10 -87.81152 30.25790 1 10 Alabama <NA> AL 2014 47
11 -87.88026 30.24644 1 11 Alabama <NA> AL 2014 47
12 -87.92037 30.24644 1 12 Alabama <NA> AL 2014 47
13 -87.95475 30.24644 1 13 Alabama <NA> AL 2014 47
14 -88.00632 30.24071 1 14 Alabama <NA> AL 2014 47
AvgDurSeconds
1 502.8936
2 502.8936
3 502.8936
4 502.8936
5 502.8936
6 502.8936
7 502.8936
8 502.8936
9 502.8936
10 502.8936
11 502.8936
12 502.8936
13 502.8936
14 502.8936
# build graph: choropleth of the total number of sightings per state
# `text` is not a ggplot2 aesthetic; it is only carried along so that
# ggplotly() can show it as the hover tooltip
p_ufo_state <- ggplot(
  data = us_states_ufo,
  mapping = aes(
    x = long, y = lat, group = group, fill = Total,
    text = paste("Total Number of UFO Sightings in 2014:", Total)
  )
)
p_ufo_state <- p_ufo_state + geom_polygon(color = "white")
p_ufo_state <- p_ufo_state + ggdendro::theme_dendro()
p_ufo_state <- p_ufo_state +
  scale_fill_viridis(option = "magma", direction = -1)
p_ufo_state <- p_ufo_state +
  guides(fill = guide_legend(title = "Total Number of \nUFO Sightings in 2014"))
p_ufo_state <- p_ufo_state + coord_map()
# static version of the map
p_ufo_state
# interactive version; the tooltip shows only the `text` aesthetic
ggplotly(p_ufo_state, tooltip = "text")
Second graph - average duration
# build graph: choropleth of the average sighting duration per state
# `text` is not a ggplot2 aesthetic; it is only carried along so that
# ggplotly() can show it as the hover tooltip
# (paste() already separates its arguments with a space, so the unit is
# passed as "seconds" rather than " seconds" to avoid a double space)
p_ufo_state_dur <- ggplot(
  data = us_states_ufo,
  mapping = aes(
    x = long, y = lat, group = group, fill = AvgDurSeconds,
    text = paste(
      "Average Duration of UFO Sightings in 2014:",
      round(AvgDurSeconds), "seconds"
    )
  )
)
p_ufo_state_dur <- p_ufo_state_dur + geom_polygon(color = "white")
p_ufo_state_dur <- p_ufo_state_dur + ggdendro::theme_dendro()
p_ufo_state_dur <- p_ufo_state_dur +
  scale_fill_viridis(option = "magma", direction = -1)
p_ufo_state_dur <- p_ufo_state_dur +
  guides(
    fill = guide_legend(
      title = "Average Duration (in seconds) \nof UFO Sightings in 2014"
    )
  )
p_ufo_state_dur <- p_ufo_state_dur + coord_map()
# static version of the map
p_ufo_state_dur
# interactive version; the tooltip shows only the `text` aesthetic
ggplotly(p_ufo_state_dur, tooltip = "text")