This workshop Disaster Data for Demographic Research is funded by CACHE https://agingclimatehealth.org/
-Put a “#” in front of a line to comment it out -tidyverse is a popular group of packages that includes dplyr, ggplot2 (referred to as just ggplot by many), and more -tidyverse uses a coding convention that uses pipes (%>%) to chain together functions in an organized manner -Check out the variety of cheatsheets for RStudio, Rmarkdown and various tidyverse packages including for data visualization with ggplot https://rstudio.github.io/cheatsheets/
-Rmarkdown documents can be “knit” to create a nice output document. This Rmarkdown document is set to output an HTML document when you knit it but other options include a Word document or PDF - we use HTML to best preserve the animations. CAUTION: Knitting can take quite a bit of time as it will run the entire code from start to finish (likely >10 minutes for this document due to the animations, potentially longer depending on your computer and your additions to the code) so we do not recommend using the “Knit on Save” feature and suggest waiting to knit until the end. -Rmarkdown documents have text (like this sentence) and code chunks (that start and end with three backticks ` ). -You can create a new code chunk by adding these symbols with {r } after the initial three backticks or by clicking the button that has a green + in a white circle with a white C in a green box. -You can run code chunks by selecting and running code as normal or using the buttons at the top right of each code chunk. The green right-pointing arrow runs that code chunk and the grey down-pointing arrow with a green line under it runs all code chunks above that code chunk. -Note that when running the code, many outputs will show up below the code chunk rather than in the Console or the Plots panel as you might be used to (with the exception of animations, which show in the Viewer panel). -Look at the top right of the code where it says “Outline” (under “Publish”) - you can click on that and use it as a Table of Contents to skip around. The hashtags (#, ##, and ###) throughout the document outside of the code chunks create the outline structure. -You can use the little triangles next to line numbers to minimize whole sections (arrows next to lines with # or ## at the start) or a chunk of code (arrows next to the start of a chunk).
#Load a shapefile using package tigris (requires internet connection)
#counties_sc = counties("SC")
#another option: download county-level file from NHGIS or Census <https://catalog.data.gov/dataset/tiger-line-shapefile-2022-state-south-carolina-sc-county-subdivision>.
# to read in a shapefile
counties_sc<-st_read("counties_sc.shp")
## Reading layer `counties_sc' from data source
## `/Users/Jenna/Desktop/Data/PAA_MIM/counties_sc.shp' using driver `ESRI Shapefile'
## Simple feature collection with 46 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -83.35393 ymin: 31.99595 xmax: -78.4993 ymax: 35.21548
## Geodetic CRS: NAD83
# to save a shapefile
#st_write(counties_sc, "counties_sc.shp")
#investigate CRS (Coordinate Reference System aka the map projection)
st_crs(counties_sc)
## Coordinate Reference System:
## User input: NAD83
## wkt:
## GEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["latitude",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["longitude",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4269]]
#Visualize the counties - later we will learn how to add labels
ggplot(counties_sc)+geom_sf()
#Load a shapefile using package tigris (requires internet connection)
#pumas_sc<-pumas("SC")
# to read in a shapefile
pumas_sc<-st_read("pumas_sc.shp")
## Reading layer `pumas_sc' from data source
## `/Users/Jenna/Desktop/Data/PAA_MIM/pumas_sc.shp' using driver `ESRI Shapefile'
## Simple feature collection with 41 features and 10 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -83.35393 ymin: 31.99595 xmax: -78.4993 ymax: 35.21548
## Geodetic CRS: NAD83
# to save a shapefile
#st_write(pumas_sc, "pumas_sc.shp")
#investigate CRS (Coordinate Reference System aka the map projection)
st_crs(pumas_sc)
## Coordinate Reference System:
## User input: NAD83
## wkt:
## GEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["latitude",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["longitude",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4269]]
#Visualize the PUMAS
ggplot(pumas_sc)+geom_sf()
Draw counties in black and PUMAs in red.
ggplot() +
geom_sf(data = counties_sc, aes(color = "Counties"), size = 0.5, fill = NA) +
geom_sf(data = pumas_sc, aes(color = "PUMAs"), linetype = "dotted", fill = NA) +
scale_color_manual(name = "Boundaries",
values = c("Counties" = "black", "PUMAs" = "red")) +
theme_classic() +
theme(legend.position = "bottom")
ggplot(counties_sc)+
geom_sf()+geom_sf_label(aes(label=NAME), size = 2)+
#to adjust label type & size: to change variable of label, replace NAME with variable of interest and to make font size bigger, change size=2 to size=3 or 4
theme_void(base_size=20)
#read in a CSV (if get an error that you can't find the file, try setting a working directory or making sure the CSV is in the same folder as the .Rmd file)
SHELDUS_sc<-read.csv("SHELDUS_sc.csv")
#explore how many records per county
table(SHELDUS_sc$county_fips)
##
## 45001 45003 45005 45007 45009 45011 45013 45015 45017 45019 45021 45023 45025
## 31 42 29 52 27 37 52 45 31 64 38 30 48
## 45027 45029 45031 45033 45035 45037 45039 45041 45043 45045 45047 45049 45051
## 36 40 45 35 51 28 37 47 53 59 38 38 60
## 45053 45055 45057 45059 45061 45063 45065 45067 45069 45071 45073 45075 45077
## 41 42 51 39 24 53 34 43 44 38 48 45 53
## 45079 45081 45083 45085 45087 45089 45091
## 64 37 57 41 38 44 47
#explore how many records per year across all counties (barplot of records per year)
ggplot(SHELDUS_sc, aes(x=year))+geom_bar()
#explore the population variable from the SHELDUS data over the whole state
#Note: The variable Population in the SHELDUS data correspond to counties that experienced a disaster. If a given county did not experience any disasters in a given year, the Population cell will be missing because there is no SHELDUS record.
SHELDUS_sc%>%
filter(hazard=="All"& year>2012 & year<=2022)%>%
group_by(year)%>%
summarize(pop=sum(population))%>%
ggplot(aes(x=year, y=pop))+
geom_line()+
scale_y_continuous(labels = scales::comma)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2012,2021,1))+
ylab("Total Pop in SC (based on SHELDUS, some missingness)")+
xlab("Year")
#explore the population variable from the SHELDUS data by county
popcountyyear<-SHELDUS_sc%>%
filter(hazard=="All"&year>2012 & year<=2022)%>%
group_by(year, county_fips)%>%
summarize(pop=sum(population))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
ggplot(popcountyyear, aes(x=year, y=pop, group = county_fips, color=county_fips))+
geom_line()+
geom_label(aes(label=county_fips),size=3, data=popcountyyear%>%filter(year==2013), nudge_x = -0.8)+
theme_classic()+ #removes gridlines
theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::comma)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2012,2021,1))+
ylab("Total Pop in County (SHELDUS, note: missingness)")+
xlab("Year")
#explore some variables of interest
table(SHELDUS_sc$fatalities, useNA = "always")
##
## 0 1 2 3 4 5 9 <NA>
## 1770 150 31 11 9 3 2 0
table(SHELDUS_sc$exposure, useNA = "always")
##
## 0 1 <NA>
## 1339 637 0
table(SHELDUS_sc$hazard, useNA = "always")
##
## All Dry_hazards Flood Hurricane Storm <NA>
## 837 15 257 45 822 0
#Summarize some info by county over the entire time period
countysummary <-SHELDUS_sc%>%
filter(year>2012 & year<=2022)%>% #get years 2012-2021 for ACS years 2011-2020
group_by(county_fips)%>%
dplyr::summarize(count=n(), #get a count of hazards
hazardtypes=length(unique(hazard))-1, #how many types of hazards, subtract 1 for "All" category
fatalities=sum(fatalities[hazard=="All"]),
totalloss=sum(totalloss[hazard=="All"])
)
#Summarize some info county by year (only "All" records)
countyyearsummary <-SHELDUS_sc%>%
filter(hazard=="All" & year>2012 & year<=2022)%>%
arrange(year)%>%
group_by(county_fips)%>%
complete(year=2013:2022)%>% #add rows/observations for counties without any records for a specific year
fill(county_fips, .direction = "updown")%>% #fill in county code to maintain these added rows for missing years
ungroup()%>%
mutate(fatalities= if_else(is.na(fatalities),0,fatalities), #fills in added county-years with 0s
exposure= if_else(is.na(exposure),0,exposure))
table(countyyearsummary$exposure)
##
## 0 1
## 356 104
#Note: to get output to appear individually, separate out code into smaller chunks
#check fields to merge on, must be the same type (also note: can run into issues with leading zeros with integers)
str(countysummary$county_fips) #check the structure of the variable (tells you variable type)
## int [1:46] 45001 45003 45005 45007 45009 45011 45013 45015 45017 45019 ...
str(counties_sc$GEOID) #check the structure of the variable (tells you variable type)
## chr [1:46] "45023" "45081" "45089" "45035" "45015" "45047" "45069" "45009" ...
#Note: county_fips and GEOID are two different types of variables, so in addition to renaming county_fips we transform the variable type to match so a join can be accomplished
countysummary<-countysummary%>%
mutate(GEOID = as.character(county_fips))
#join countysummary
countiessummary <- left_join(counties_sc, countysummary, by = "GEOID")
#join countyyearsummary
countyyearsummary<-countyyearsummary%>%
mutate(GEOID = as.character(county_fips))
countiesyearsummary <- left_join(counties_sc, countyyearsummary, by = "GEOID")
#Map the count # of records using a continuous scale
ggplot(countiessummary) +
geom_sf(aes(fill=count), color = "#000000")+ #"color" sets the border color
geom_sf_label(aes(label=count), size = 2)+ #size controls the size of the label, name gives the number of records can label with GEOID to identify specific counties if you wish
theme_void() + #removes axes
scale_fill_gradientn(colors = c("#A5D1E1","#191970"), name = "# of SHELDUS \n Records") #adds colors (Hex codes) and label of axes, \n adds line break
#distribution of fatalities by county
ggplot(countiessummary, aes(x=fatalities))+
geom_histogram(binwidth = 1) #binwidth controls the width of the histogram bars/bins, can also specify # of bins with "bins=10" to make 10 bins for example
#Map the # of fatalities using discrete color bins
ggplot(countiessummary) +
geom_sf(aes(fill=fatalities), color = "#000000")+ #"color" sets the border color
geom_sf_label(aes(label=fatalities), size = 3)+ #adds labels of # of fatalities, size controls label size
theme_void() + #removes axes
#scale_fill_gradientn(colors = c("#A7C7E7","#191970"), breaks= c(5,10,20), name = "# Fatalities over period")
scale_fill_stepsn(colors = c("#A7C7E7","#134990","#191950"), breaks= c(1,5,20)) #adds colors (Hex codes) and specifies numbers to break at
#Total Loss quintiles
#distribution of total loss over the period by county
ggplot(countiessummary, aes(x=totalloss))+
geom_histogram()+
scale_x_continuous(labels = scales::comma)+ #adds commas on Y axis
ylab("Frequency (count)")+
xlab("Total Loss")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Here is one of several ways to classify a variable into quantiles
library(classInt)
breaks_qt <- classIntervals(c(min(countiessummary$totalloss) - .00001, countiessummary$totalloss), #sets min and max
n = 5, style = "quantile") #n specifies how many, style specifies method used
countiessummary<-countiessummary%>%
mutate(quantiles = cut(totalloss, breaks_qt$brks)) #creates a new variable quantiles using the cutoffs specified in breaks_qt
#Map the total loss over the period using quintiles discrete color bins
ggplot(countiessummary) +
geom_sf(aes(fill=quantiles), color = "#000000")+ #"color" sets the border color
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+
theme_void() + #removes axes
scale_fill_brewer(palette = "PuBu",
labels= c("Low","","","","High"), #if you want the numbers, comment out this line by putting a # in front, and if you want to use the cutoffs specified in breaks_qt
)
plot<- countiesyearsummary%>%
ggplot() +
geom_sf(aes(fill=fatalities), color = "#000000")+ #"color" sets the border color
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+
theme_void() + #removes axes
#scale_fill_gradientn(colors = c("#A7C7E7","#191970"), breaks= c(5,10,20), name = "# Fatalities over period")
scale_fill_stepsn(colors = c("#A7C7E7","#134990","#191950"), breaks= c(1,5,20))+
transition_states(year, transition_length = 1, state_length = 1, wrap=T) +
labs(title = "Fatalities by County: {closest_state}", caption="\n\nCode: CACHE PAA Workshop \nSource: SHELDUS \n",fill = "# Fatalities")
animate(plot, nframes = 10, fps=1, renderer = gifski_renderer()) #note the default is nframes=100, fps=10 if anything seems off (reduced the # of frames to reduce rendering time for the sake of the workshop)
#anim_save("fatalities_animation.gif") #how you would save
ggplot(countiesyearsummary, aes(x=totalloss))+
geom_histogram()+ #creates histogram
scale_x_continuous(labels = scales::comma) #adds commas on Y axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot<- countiesyearsummary%>%
ggplot() +
geom_sf(aes(fill=totalloss), color = "#000000")+ #"color" sets the border color
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+
theme_void() + #removes axes
#scale_fill_gradientn(colors = c("#A7C7E7","#191970"), breaks= c(5,10,20), name = "# Fatalities over period")
scale_fill_stepsn(colors = c("#A7C7E7","#134960","#191950"), breaks= c(50000,10000000,100000000), labels = c("$50K","$10M","$100M"))+
transition_states(year, transition_length = 1, state_length = 1, wrap=T) +
labs(title = "Total Loss by County: {closest_state}", caption="\nCode: CACHE PAA Workshop \nSource: SHELDUS \n",fill = "Total Loss")
animate(plot, nframes = 10, fps=1, renderer = gifski_renderer()) #note the default is nframes=100, fps=10 if anything seems off (reduced the # of frames to reduce rendering time for the sake of the workshop)
#anim_save("totalloss_animation.gif")
table(countiesyearsummary$exposure, countiesyearsummary$year)
##
## 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
## 0 37 35 26 29 31 42 41 29 42 44
## 1 9 11 20 17 15 4 5 17 4 2
plot<- countiesyearsummary%>%
mutate(exposed = if_else(exposure==1,"Exposed","Not Exposed"))%>% #create a factor variable "exposed" to visualize exposure (0 & 1) as categorical with nice labels (can calculate this before if helpful)
ggplot() + #creates a ggplot
geom_sf(aes(fill=exposed, group = GEOID), color = "#000000")+ #"color" sets the border color #add group=GEOID to keep the counties stationary (otherwise the animation may move them around)
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+ #adds a label
theme_void() + #removes axes
scale_fill_manual(values = c("grey", "red"), labels = c("Not Exposed", "Exposed")) + # Discrete fill scale
transition_states(year, transition_length = 1, state_length = 1, wrap=T) + #tells animation to animate over year
labs(title = "Exposure by County: {closest_state}", caption="\nCode: CACHE PAA Workshop \nSource: SHELDUS \n",fill = "Exposure") #adds titles and labels
animate(plot, nframes = 10, fps=1, renderer = gifski_renderer()) #note the default is nframes=100, fps=10 if anything seems off (reduced the # of frames to reduce rendering time for the sake of the workshop)
#anim_save("exposure_animation.gif")
##Your turn!
Working with the ACS microdata
# List all relevant CSV files in the folder (naming convention was "merged_SC_YEAR.csv")
file_list <- list.files(pattern = "^merged_SC_\\d{4}\\.csv$", full.names = TRUE)
# Read and combine all CSV files
microdata <- bind_rows(lapply(file_list, read_csv))
#preview the data and look at variables
head(microdata)
## # A tibble: 6 Ă— 114
## ...1 County_FIPS Year STATEFIP PUMA YEAR SAMPLE SERIAL CBSERIAL HHWT
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 '45001' 2013 45 1600 2012 201201 1101716 1153805 99
## 2 2 '45001' 2013 45 1600 2012 201201 1085428 36985 40
## 3 3 '45001' 2013 45 1600 2012 201201 1086273 94825 47
## 4 4 '45001' 2013 45 1600 2012 201201 1100539 1071660 21
## 5 5 '45001' 2013 45 1600 2012 201201 1086925 138922 77
## 6 6 '45001' 2013 45 1600 2012 201201 1099000 966229 77
## # ℹ 104 more variables: CLUSTER <dbl>, STRATA <dbl>, GQ <dbl>, NUMPREC <dbl>,
## # HHTYPE <dbl>, REGION <dbl>, STATEICP <dbl>, COUNTYICP <dbl>,
## # COUNTYFIP <dbl>, DENSITY <dbl>, METRO <dbl>, PCTMETRO <dbl>, MET2013 <dbl>,
## # OWNERSHP <dbl>, OWNERSHPD <dbl>, TAXINCL <dbl>, PROPINSR <dbl>,
## # RENTGRS <dbl>, COSTELEC <dbl>, HHINCOME <dbl>, VALUEH <dbl>,
## # LINGISOL <dbl>, FUELHEAT <dbl>, VEHICLES <dbl>, NFAMS <dbl>, MULTGEN <dbl>,
## # MULTGEND <dbl>, PERNUM <dbl>, PERWT <dbl>, FAMSIZE <dbl>, SFTYPE <dbl>, …
New variables have been appended to the end of the dataset. You can see the number of columns will change.
microdata<-microdata%>%
#transmute( ) #use this if wish to create a new dataframe with only the new columns below (if you use this, it is recommended to also change the name of the df)
mutate(
GEOID = gsub("'","",County_FIPS), #remove the ' from the variable County_FIPS and rename it GEOID for easy merge with the counties_sc shapefile
YEAR_ACS= YEAR,
YEAR_SHELDUS= Year,
person=paste0(SERIAL,PERNUM),#calculate weight accounting for duplications due to the county-PUMA crosswalk
final_weight=afact*PERWT,
gender= case_when(
SEX==1 ~ 'Male',
SEX==2 ~ 'Female',),
race_cat = case_when(
RACE == 1 ~ "White",
RACE == 2 ~ "Black/African American",
RACE == 3 ~ "American Indian or Alaska Native",
RACE == 4 | RACE == 5 | RACE == 6 ~ "Asian",
RACE == 7 ~ "Other",
RACE == 8 | RACE == 9 ~ "Two or more major races",
TRUE ~ NA_character_
),
race_cat = as.factor(race_cat),
hisp_cat = case_when(
HISPAN == 0 ~ "Not Hispanic",
HISPAN == 1 | HISPAN == 2 |HISPAN == 3|HISPAN == 4 ~ "Hispanic",
HISPAN == 9 ~ NA,
TRUE ~ NA_character_
),
raceeth_cat = case_when(
hisp_cat == "Hispanic" ~ 'Hispanic',
hisp_cat == "Not Hispanic" & race_cat == "White" ~ 'Non-hispanic White',
hisp_cat == "Not Hispanic" & race_cat == "Black/African American" ~ 'Non-hispanic Black',
hisp_cat == "Not Hispanic" & race_cat == "Asian" ~ 'Non-hispanic Asian',
hisp_cat == "Not Hispanic" & race_cat == "Other" ~ 'Non-hispanic Other',
hisp_cat == "Not Hispanic" & race_cat == "American Indian or Alaska Native" ~ 'American Indian/Alaska Native',
hisp_cat == "Not Hispanic" & race_cat == "Two or more major races" ~ 'Non-hispanic two+ races',
),
age_cat = case_when(
AGE >= 0 & AGE <= 4 ~ "A. Age: 0-4",
AGE >= 5 & AGE <= 17 ~ "B. Age: 5-17",
AGE >= 18 & AGE <= 24 ~ "C. Age: 18-24",
AGE >= 25 & AGE <= 34 ~ "D. Age: 25-34",
AGE >= 35 & AGE <= 44 ~ "E. Age: 35-44",
AGE >= 45 & AGE <= 64 ~ "F. Age: 45-64",
AGE >= 65 & AGE <= 84 ~ "G. Age: 65-84",
AGE >= 85 ~ "H. Age: 85+",
TRUE ~ NA_character_
),
age_cat = as.factor(age_cat),
diffcare_recoded = case_when(
DIFFCARE == 1 ~ "No",
DIFFCARE == 2 ~ "Yes",
DIFFCARE == 0 ~ "Unknown",
TRUE ~ NA_character_
),
diffcare_recoded = factor(diffcare_recoded, levels = c("Unknown","No","Yes")),
diffmob_recoded = case_when(
DIFFMOB == 1 ~ "No",
DIFFMOB == 2 ~ "Yes",
DIFFMOB == 0 ~ "Unknown",
TRUE ~ NA_character_
),
diffmob_recoded = factor(diffmob_recoded, levels = c("Unknown","No","Yes")),
diffphys_recoded = case_when(
DIFFPHYS == 1 ~ "No",
DIFFPHYS == 2 ~ "Yes",
DIFFPHYS == 0 ~ "Unknown",
TRUE ~ NA_character_
),
diffrem_recoded = case_when(
DIFFREM == 1 ~ "No",
DIFFREM == 2 ~ "Yes",
DIFFREM == 0 ~ "Unknown",
TRUE ~ NA_character_
),
diffsens_recoded = case_when(
DIFFSENS == 1 ~ "No",
DIFFSENS == 2 ~ "Yes",
DIFFSENS == 0 ~ "Unknown",
TRUE ~ NA_character_
),
)
colnames(microdata) #new variables have been appended to the end of the dataset
## [1] "...1" "County_FIPS"
## [3] "Year" "STATEFIP"
## [5] "PUMA" "YEAR"
## [7] "SAMPLE" "SERIAL"
## [9] "CBSERIAL" "HHWT"
## [11] "CLUSTER" "STRATA"
## [13] "GQ" "NUMPREC"
## [15] "HHTYPE" "REGION"
## [17] "STATEICP" "COUNTYICP"
## [19] "COUNTYFIP" "DENSITY"
## [21] "METRO" "PCTMETRO"
## [23] "MET2013" "OWNERSHP"
## [25] "OWNERSHPD" "TAXINCL"
## [27] "PROPINSR" "RENTGRS"
## [29] "COSTELEC" "HHINCOME"
## [31] "VALUEH" "LINGISOL"
## [33] "FUELHEAT" "VEHICLES"
## [35] "NFAMS" "MULTGEN"
## [37] "MULTGEND" "PERNUM"
## [39] "PERWT" "FAMSIZE"
## [41] "SFTYPE" "SFRELATE"
## [43] "RELATE" "RELATED"
## [45] "SEX" "AGE"
## [47] "MARST" "RACE"
## [49] "RACED" "HISPAN"
## [51] "HISPAND" "BPL"
## [53] "BPLD" "YRIMMIG"
## [55] "YRSUSA1" "YRSUSA2"
## [57] "HCOVANY" "SCHOOL"
## [59] "EDUC" "EDUCD"
## [61] "EMPSTAT" "EMPSTATD"
## [63] "OCC" "IND"
## [65] "INCTOT" "FTOTINC"
## [67] "INCWAGE" "INCBUS00"
## [69] "INCSS" "INCWELFR"
## [71] "INCINVST" "INCRETIR"
## [73] "INCSUPP" "INCOTHER"
## [75] "INCEARN" "POVERTY"
## [77] "HWSEI" "MIGRATE1"
## [79] "MIGRATE1D" "MIGPUMA1"
## [81] "MOVEDIN" "VETDISAB"
## [83] "DIFFREM" "DIFFPHYS"
## [85] "DIFFMOB" "DIFFCARE"
## [87] "DIFFSENS" "DIFFEYE"
## [89] "DIFFHEAR" "GCRESPON"
## [91] "county" "stab"
## [93] "CountyName" "PUMA12name"
## [95] "pop20" "IntPtLat"
## [97] "IntPtLon" "afact"
## [99] "CropDmg_ALL" "CropDmg(ADJ 2022)_ALL"
## [101] "CropDmgPerCapita(ADJ 2022)_ALL" "PropertyDmg_ALL"
## [103] "PropertyDmg(ADJ 2022)_ALL" "PropertyDmgPerCapita(ADJ 2022)_ALL"
## [105] "Injuries_ALL" "InjuriesPerCapita_ALL"
## [107] "Fatalities_ALL" "FatalitiesPerCapita_ALL"
## [109] "Duration_Days_ALL" "Fatalities_Duration_ALL"
## [111] "Injuries_Duration_ALL" "Property_Damage_Duration_ALL"
## [113] "Crop_Damage_Duration_ALL" "Records_ALL"
## [115] "GEOID" "YEAR_ACS"
## [117] "YEAR_SHELDUS" "person"
## [119] "final_weight" "gender"
## [121] "race_cat" "hisp_cat"
## [123] "raceeth_cat" "age_cat"
## [125] "diffcare_recoded" "diffmob_recoded"
## [127] "diffphys_recoded" "diffrem_recoded"
## [129] "diffsens_recoded"
popbyyearsc<-microdata%>%
group_by(YEAR_ACS)%>%
dplyr::summarize(pop_total=sum(final_weight, na.rm=T))
popbyyear<-microdata%>%
group_by(YEAR_ACS, GEOID)%>%
dplyr::summarize(pop_total=sum(final_weight, na.rm=T))
## `summarise()` has grouped output by 'YEAR_ACS'. You can override using the
## `.groups` argument.
#Line graph by year
ggplot(popbyyearsc, aes(x=YEAR_ACS, y=pop_total))+
geom_line()+ #adds line graph
#geom_label(aes(label=GEOID),size=3, data=popbyyear%>%filter(YEAR_ACS==2012), nudge_x = -0.8)+
theme_classic()+ #removes gridlines
theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::comma)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2012,2021,1))+
ylab("Total Population in SC")+
xlab("Year")
#Line graph by county and by year
ggplot(popbyyear, aes(x=YEAR_ACS, y=pop_total, color=GEOID))+
geom_line()+ #adds line graph
geom_label(aes(label=GEOID),size=3, data=popbyyear%>%filter(YEAR_ACS==2012), nudge_x = -0.8)+
theme_classic()+ #removes gridlines
theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::comma)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2012,2021,1))+
ylab("Total Population in County")+
xlab("Year")
#Line graph by county and by year but each county gets its own graph (facet) with a variable y-axis
ggplot(popbyyear, aes(x=YEAR_ACS, y=pop_total))+
geom_line()+ #adds line graph
facet_wrap(~GEOID, scales = "free_y")+ #facet_wrap() or facet_grid() allow for faceting by county GEOID
#scales = "free_y" allows for flexible y-axis
theme_classic()+ #theme_classic sets the theme to one that removes gridlines
theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::comma)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2012,2021,1))+
ylab("Total Population in County")+
xlab("Year")
stats_by_countyyear <- microdata %>%
group_by(GEOID, YEAR_ACS) %>%
dplyr::summarize(
pop_total = sum(final_weight, na.rm = TRUE),
pop_over_65 = sum(final_weight[AGE >= 65], na.rm = TRUE),
pop_over_65_diffphys = sum(final_weight[AGE >= 65 & diffphys_recoded=="Yes"], na.rm = TRUE),
pop_over_65_NOdiffphys = sum(final_weight[AGE >= 65 & diffphys_recoded=="No"], na.rm = TRUE),
pct_over_65 = (pop_over_65 / pop_total) * 100,
pct_over_65_diffphys = (pop_over_65_diffphys / (pop_over_65_diffphys+ pop_over_65_NOdiffphys)) * 100,
pct_over65diffphys_total = (pop_over_65_diffphys / pop_total) * 100,
) %>%
ungroup()
## `summarise()` has grouped output by 'GEOID'. You can override using the
## `.groups` argument.
#Note: could also use the following code to calculate percentages if you first convert the variable to a binary numeric variable where 0=no and 1=yes
#within dplyr::summarize
# pct_diffphys_weighted = weighted.mean(diffphys_binarytest, final_weight, na.rm = TRUE)
view(stats_by_countyyear)
countiesstats <- left_join(counties_sc, stats_by_countyyear, by = "GEOID")
countiesstats <- countiesstats%>%
mutate(pop_density = pop_total / ((ALAND)/1000000)) #calculate a new variable pop_density
#note: ALAND is in square meters so we divide by 1M to convert to square km
#View a summary of the distribution of the new variable
summary(countiesstats$pop_density)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.515 16.846 30.369 58.113 77.352 265.218
#Look at pop data for 2012 (or all years by commenting out that line) to get sense of distribution
countiesstats%>%
filter(YEAR_ACS==2012)%>% #have to add a year for static graph, could comment out to see all years
ggplot(aes(pop_total)) +
geom_histogram()+
scale_x_continuous(labels = scales::comma)+ #adds commas on x axis
xlab("Total Population in County")+
ylab("Frequency (count)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Create a faceted map of county pop by year
countiesstats%>%
#filter(YEAR_ACS==2012)%>% #can look at only a year for static map
ggplot() +
geom_sf(aes(fill=pop_density), color = "#000000")+ #"color" sets the border color
facet_wrap(~YEAR_ACS)+ #make a graph for each year
#geom_sf_label(aes(label=scales::comma(round(pop_total))), size = 3)+ #scales::comma adds commas in label, round() rounds to an integer (decimal estimates due to ACS weights and the PUMA-county crosswalk), and size=3 controls the size of the label
theme_void() + #removes axes
scale_fill_stepsn(colors = c("#A5D1E1","#191970"), name = "Population Density",breaks= c(50, 100, 150, 200))
#Create a map of population density
countiesstats%>%
filter(YEAR_ACS==2012)%>% #have to add a year for static map
ggplot() +
geom_sf(aes(fill=pop_density), color = "#000000")+ #"color" sets the border color
geom_sf_label(aes(label=scales::comma(round(pop_density))), size = 3)+ #scales::comma adds commas in label, round() rounds to an integer (decimal estimates due to ACS weights and the PUMA-county crosswalk), and size=3 controls the size of the label
theme_void()+ #removes axes
scale_fill_stepsn(colors = c("#A5D1E1","#191970"), name = "Population Density in 2012",breaks= c(50, 100, 150, 200))
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data
plot<- countiesstats%>%
ggplot() +
geom_sf(aes(fill=pop_total), color = "#000000")+ #"color" sets the border color
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+
theme_void() + #removes axes
scale_fill_stepsn(colors = c("#A5D1E1","#191970"),breaks= c(100000,150000,200000,250000,300000), labels = c("100K","150K","200K","250K","300K"))+
transition_states(YEAR_ACS, transition_length = 1, state_length = 1, wrap=T) +
labs(title = "Population by County: {closest_state}", caption="\n\nCode: CACHE PAA Workshop \nSource: Census Bureau \n American Community Survey",fill = "Population")
animate(plot, renderer = gifski_renderer()) #note the default is nframes=100, fps=10 if anything seems off (reduced the # of frames to reduce rendering time for the sake of the workshop)
countiesstats%>%
filter(YEAR_ACS==2012)%>% #have to add a year for static graph, could comment out to see all years or facet to see year by year
ggplot(aes(pop_over_65)) +
geom_histogram()+
scale_x_continuous(labels = scales::comma)+ #adds commas on x axis
xlab("Population over 65 per county")+
ylab("Frequency (count)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot<- countiesstats%>%
ggplot() +
geom_sf(aes(fill=pop_over_65), color = "#000000")+ #"color" sets the border color
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+
theme_void() + #removes axes
scale_fill_stepsn(colors = c("#A5D1E1","#191970"),breaks= c(10000,15000,20000,25000,30000,40000,50000,80000), labels = c("10K","15K","20K","25K","30K","40K","50K","80K"))+
transition_states(YEAR_ACS, transition_length = 1, state_length = 1, wrap=T) +
labs(title = "Population over 65 by County: {closest_state}", caption="\n\nCode: CACHE PAA Workshop \nSource: Census Bureau \n American Community Survey",fill = "Population \nover 65")
animate(plot, renderer = gifski_renderer()) #note the default is nframes=100, fps=10 if anything seems off (reduced the # of frames to reduce rendering time for the sake of the workshop)
countiesstats%>%
ggplot(aes(pct_over65diffphys_total)) +
geom_histogram()+ #creates a histogram
facet_wrap(~YEAR_ACS)+ #creates a graph per year
xlab("% of total population who are over 65 and have a physical disability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot<- countiesstats%>%
ggplot() +
geom_sf(aes(fill=pct_over65diffphys_total), color = "#000000")+ #"color" sets the border color
#geom_sf_label(aes(label=GEOID), label.size = 0.5)+
theme_void() + #removes axes
scale_fill_stepsn(colors = c("#A5D1E1","#191970"),breaks= c(3,4,5,6), labels = c("3%","4%","5%","6%"))+
transition_states(YEAR_ACS, transition_length = 1, state_length = 1, wrap=T) +
labs(title = " % over 65 with physical disability by County: {closest_state}", caption="\n\nCode: CACHE PAA Workshop \nSource: Census Bureau \nAmerican Community Survey",fill = "% over 65 with \nphysical disability")
animate(plot, renderer = gifski_renderer()) #note the default is nframes=100, fps=10 if anything seems off (reduced the # of frames to reduce rendering time for the sake of the workshop)
Note: These two datsets have already been linked in the data provided. For the sake of this workshop, we do not show here the attribute table join (linking or merging two datasets on a matched field/variable) of the two datasets but plan to make that code available in the future on the CACHE website. The merge was accomplished by matching the year and the county FIPS.
STILL REMAINING: ACS and SHELDUS analyses together
Note: you can change the per-capita threshold of $4.1 to something greater
microdata <- microdata%>%
group_by(GEOID, YEAR_ACS) %>%
mutate(
pop_total = sum(final_weight, na.rm = TRUE)
) %>%
fill(pop_total, .direction = "updown")%>%
ungroup()%>%
mutate(
CropDmg = ifelse(is.na(`CropDmg(ADJ 2022)_ALL`), 0, `CropDmg(ADJ 2022)_ALL`),
PropertyDmg = ifelse(is.na(`PropertyDmg(ADJ 2022)_ALL`), 0, `PropertyDmg(ADJ 2022)_ALL`),
Fatalities = ifelse(is.na(Fatalities_ALL), 0, Fatalities_ALL),
TotalLoss = ((CropDmg + PropertyDmg) + (Fatalities*11400000)),
TotalLoss_percapita = TotalLoss / pop_total , #note that the population total used here is from the ACS weights from the PUMA-County crosswalk which is similar but slightly different from the population used for the SHELDUS records which is county population for years in which there is an event. One could join in a table from the Census Bureau with the complete records of population by county by year if of interest.
exposed = if_else(TotalLoss_percapita >= 4.1, 1, 0) #using the 2022 FEMA county-level threshold
)
Look at exposure by year by age group
exposurebyyear <- microdata %>%
group_by(YEAR_SHELDUS) %>% #note: Year is SHELDUS disaster year not ACS year
dplyr::summarize(
pct_over_65_exposed = sum(final_weight[AGE >= 65 & exposed==1], na.rm = TRUE) / sum(final_weight[AGE >= 65], na.rm = TRUE),
pct_under_65_exposed = sum(final_weight[AGE < 65 & exposed==1], na.rm = TRUE) / sum(final_weight[AGE < 65], na.rm = TRUE),
pct_over_65_NOphysdiff_exposed = sum(final_weight[AGE >= 65 & exposed==1 & diffphys_recoded=="No"], na.rm = TRUE) / sum(final_weight[AGE >= 65 & diffphys_recoded=="No"], na.rm = TRUE),
pct_under_65_NOphysdiff_exposed = sum(final_weight[AGE < 65 & exposed==1 & diffphys_recoded=="No"], na.rm = TRUE) / sum(final_weight[AGE < 65 & diffphys_recoded=="No"], na.rm = TRUE),
pct_over_65_physdiff_exposed = sum(final_weight[AGE >= 65 & exposed==1 & diffphys_recoded=="Yes"], na.rm = TRUE) / sum(final_weight[AGE >= 65 & diffphys_recoded=="Yes"], na.rm = TRUE),
pct_under_65_physdiff_exposed = sum(final_weight[AGE < 65 & exposed==1 & diffphys_recoded=="Yes"], na.rm = TRUE) / sum(final_weight[AGE < 65 & diffphys_recoded=="Yes"], na.rm = TRUE),
) %>%
ungroup()
#make a plot
ggplot(exposurebyyear, aes(x=YEAR_SHELDUS))+
geom_line( aes(y=pct_over_65_exposed, color="Over 65"))+ #adds line graph
geom_line( aes(y=pct_under_65_exposed, color="Under 65"))+ #adds line graph
scale_color_manual(name = "Group",
values = c("Over 65" = "blue", "Under 65" = "grey")) +
theme_classic()+ #theme_classic sets the theme to one that removes gridlines
#theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::percent)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2013,2022,1))+
ylab("% exposed")+
xlab("Year")
## Calculate the proportion of population by Age group and physical
difficulty status who were exposed in a given year
Look at exposure by year by age group and physical difficulty status
ggplot(exposurebyyear, aes(x=YEAR_SHELDUS))+
geom_line( aes(y=pct_over_65_NOphysdiff_exposed, color="Over 65"))+ #adds line graph
geom_line( aes(y=pct_under_65_NOphysdiff_exposed, color="Under 65"))+ #adds line graph
geom_line( aes(y=pct_over_65_physdiff_exposed, color="Over 65 with a physical disability"))+ #adds line graph
geom_line( aes(y=pct_under_65_physdiff_exposed, color="Under 65 with a physical disability"))+ #adds line graph
scale_color_manual(name = "Group",
values = c("Over 65" = "darkred", "Under 65" = "darkblue", "Over 65 with a physical disability" = "red", "Under 65 with a physical disability" = "blue")) +
theme_classic()+ #theme_classic sets the theme to one that removes gridlines
#theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::percent)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2013,2022,1))+
ylab("% exposed")+
xlab("Year")
## Calculate the proportion of population by Race/ethnicity category
& physical difficulty status who were exposed in a given year
Look at exposure by year by race/ethnicity category and physical difficulty status
table(microdata$raceeth_cat)
##
## American Indian/Alaska Native Hispanic
## 6099 62893
## Non-hispanic Asian Non-hispanic Black
## 20137 383610
## Non-hispanic Other Non-hispanic two+ races
## 2596 32881
## Non-hispanic White
## 1073715
exposureRaceEth <- microdata %>%
group_by(YEAR_SHELDUS) %>% #note: Year is SHELDUS disaster year not ACS year
dplyr::summarize(
pct_NHWhite_NOphysdiff_exposed = sum(final_weight[raceeth_cat== "Non-hispanic White" & exposed==1 & diffphys_recoded=="No"], na.rm = TRUE) / sum(final_weight[raceeth_cat=="Non-hispanic White"], na.rm = TRUE),
pct_NHBlack_NOphysdiff_exposed = sum(final_weight[raceeth_cat=="Non-hispanic Black" & exposed==1 & diffphys_recoded=="No"], na.rm = TRUE) / sum(final_weight[raceeth_cat=="Non-hispanic Black"], na.rm = TRUE),
pct_NHWhite_physdiff_exposed = sum(final_weight[raceeth_cat=="Non-hispanic White" & exposed==1 & diffphys_recoded=="Yes"], na.rm = TRUE) / sum(final_weight[raceeth_cat=="Non-hispanic White" & diffphys_recoded=="Yes"], na.rm = TRUE),
pct_NHBlack_physdiff_exposed = sum(final_weight[raceeth_cat=="Non-hispanic Black" & exposed==1 & diffphys_recoded=="Yes"], na.rm = TRUE) / sum(final_weight[raceeth_cat=="Non-hispanic Black" & diffphys_recoded=="Yes"], na.rm = TRUE),
) %>%
ungroup()
ggplot(exposureRaceEth, aes(x=YEAR_SHELDUS))+
geom_line( aes(y=pct_NHWhite_NOphysdiff_exposed, color="NHWhite without disability"))+ #adds line graph
geom_line( aes(y=pct_NHBlack_NOphysdiff_exposed, color="NHBlack without disability"))+ #adds line graph
geom_line( aes(y=pct_NHWhite_physdiff_exposed, color="NHWhite with a physical disability"))+ #adds line graph
geom_line( aes(y=pct_NHBlack_physdiff_exposed, color="NHBlack with a physical disability"))+ #adds line graph
scale_color_manual(name = "Group",
values = c("NHWhite without disability" = "darkgreen", "NHBlack without disability" = "darkred", "NHWhite with a physical disability" = "green", "NHBlack with a physical disability" = "red")) +
theme_classic()+ #theme_classic sets the theme to one that removes gridlines
#theme(legend.position="none")+ #removes legend
scale_y_continuous(labels = scales::percent)+ #adds commas on Y axis
scale_x_continuous(breaks=seq(2013,2022,1))+
ylab("% over 65 exposed")+
xlab("Year")
## Your turn!
Remember to save your document! You can knit it if you’d like an HTML version (or Word or PDF) but please note that knitting may take >10 minutes depending on your computer.