Browse Source

Initial commit

master
mikebader 4 years ago
commit
c22b6c0c95
  1. 3
      .gitattributes
  2. 6
      .gitignore
  3. 34
      dcarea_functions.R
  4. 24
      tracts/1980/tabular/race-ethnicity/construction/create-tracts-1980TIGER-race-ethnicity.R
  5. BIN
      tracts/1980/tabular/race-ethnicity/dataset/tracts-1980TIGER-race-ethnicity.csv
  6. 78
      tracts/2010/tabular/age-race-sex/construction/tracts-2010TIGER-age-race-sex.R
  7. BIN
      tracts/2010/tabular/age-race-sex/dataset/tracts-2010TIGER-age-race-sex.csv
  8. 50
      tracts/2010/tabular/children-present/construction/construct-tracts-2010TIGER-children-pres.R
  9. BIN
      tracts/2010/tabular/children-present/dataset/tracts-2010TIGER-children-present.csv
  10. 53
      tracts/2010/tabular/educ-attainment/construction/construct-tracts-2010TIGER-educ-attainment.R
  11. BIN
      tracts/2010/tabular/educ-attainment/dataset/tracts-2010TIGER-educ-attainment.csv
  12. 59
      tracts/2010/tabular/foreign-born/construction/construct-tracts-2010TIGER-foreign-born.R
  13. BIN
      tracts/2010/tabular/foreign-born/dataset/tracts-2010TIGER-foreign-born.csv
  14. 52
      tracts/2010/tabular/marital-status/construction/construct-tracts-2010TIGER-marital-status.R
  15. BIN
      tracts/2010/tabular/marital-status/dataset/tracts-2010TIGER-marital-status.csv
  16. 42
      tracts/2010/tabular/median-age/construction/construct-tracts-2010TIGER-median-age.R
  17. BIN
      tracts/2010/tabular/median-age/dataset/tracts-2010TIGER-median-age.csv
  18. 63
      tracts/2010/tabular/race-ethnicity/construction/construct-create-tracts-2010TIGER-race-ethnicity.R
  19. BIN
      tracts/2010/tabular/race-ethnicity/dataset/tracts-2010TIGER-race-ethnicity.csv

3
.gitattributes

@ -0,0 +1,3 @@
*.csv filter=lfs diff=lfs merge=lfs -text
*.shp filter=lfs diff=lfs merge=lfs
*.zip filter=lfs diff=lfs merge=lfs

6
.gitignore

@ -0,0 +1,6 @@
*.shp
*.zip
.Rhistory
.venv

34
dcarea_functions.R

@ -0,0 +1,34 @@
## Functions to create data for DC Area
## (DC & surrounding jurisdictions, including independent cities)
select.dcarea <- function(dta) {
    ## Keeps only the observations of `dta` that belong to the DC Area
    ##
    ## Arguments: `dta`: dataset from which to select observations; rows
    ##     are matched on its `GISJOIN` variable
    ## Returns: dataset containing only the rows whose `GISJOIN` starts with
    ##     `G` followed by one of the county codes listed below; `COUNTY`
    ##     and `STATE`, when present, are rebuilt as factors

    ## FIPS county codes that define the DC Area
    fips <- c(
        '110001'    # D.C.
        , '240031'  # Montgomery County
        , '240033'  # Prince George's County
        , '510013'  # Arlington County
        , '510059'  # Fairfax County
        , '510510'  # Alexandria city
        , '510600'  # Fairfax city
        , '510610'  # Falls Church city
    )

    ## One anchored alternative per county, matched against `GISJOIN`
    pattern <- paste0('^G', fips, collapse = '|')
    keep <- grep(pattern, dta$GISJOIN, perl=TRUE)
    dcarea <- dta[keep,]

    ## Re-create the factors so that only levels still present in the
    ## DC Area rows remain
    for (v in c('COUNTY', 'STATE')) {
        if (v %in% names(dcarea)) {
            dcarea[[v]] <- factor(dcarea[[v]])
        }
    }
    dcarea
}

24
tracts/1980/tabular/race-ethnicity/construction/create-tracts-1980TIGER-race-ethnicity.R

@ -0,0 +1,24 @@
## Builds the file of tract-level race & ethnicity variables for the
## DC Area from 1980 Census STF-1 data and saves it as a CSV

## Packages
library(tidyverse)

## Race variable names: the base names first, then a `p`-prefixed name for
## every base name except the first (`totpop`)
## (none of these name vectors is referenced again in this script)
races <- c('totpop', 'nhw', 'nhb', 'api', 'hsp', 'oth', 'two')
praces <- paste0('p', tail(races, -1))
racevars <- append(races, praces)

## Names of the variables that are not race variables
nonracevars <- c('GISJOIN', 'STATE', 'COUNTY')

## Bring in `select.dcarea()`
source(file = 'dcarea_functions.R')

## Load the national tract file; `trt80_re` used below is expected to
## come from it
load(file = '../US/tracts/1980/tabular/race-ethnicity/dataset/tracts-1980TIGER-race-ethnicity.Rdata')

## Restrict to DC-area tracts
racedta <- select.dcarea(dta = trt80_re)

## Save the wide dataset in CSV format
write.csv(x = racedta, file = 'tracts/1980/tabular/race-ethnicity/dataset/tracts-1980TIGER-race-ethnicity.csv')

BIN
tracts/1980/tabular/race-ethnicity/dataset/tracts-1980TIGER-race-ethnicity.csv (Stored with Git LFS)

size 242504

78
tracts/2010/tabular/age-race-sex/construction/tracts-2010TIGER-age-race-sex.R

@ -0,0 +1,78 @@
## Creates single file containing age-by-race-by-sex variables at the
## tract level for DC Area from the 2010 Census and 2010-2014 through
## 2013-2017 American Community Surveys for tracts in the DC Area

## Set up environment
library(tidyverse)

## Identify variables to keep representing age-race-sex variables
### Abbreviations for racial groups
races <- c(
    'blk'   ## Black alone
    , 'ami' ## American Indian or Native Alaskan alone
    , 'asi' ## Asian alone
    , 'pac' ## Native Hawaiian or Other Pacific Islander alone
    , 'oth' ## Other race alone
    , 'two' ## Two or more races
    , 'hsp' ## Hispanic or Latino (of any race)
    , 'nhw' ## Non-Hispanic White
)
### Age group categories
agegrps <- c(
    '0004'
    , '0509'
    , '1014'
    , '1517'
    , '1819'
    , '2024'
    , '2529'
    , '3034'
    , '3544'
    , '4554'
    , '5564'
    , '6574'
    , '7584'
    , '85up'
)
### Sex categories
sexes <- c('m', 'f')
### Variable names are race + sex + age group (e.g. `blkm0004`), ordered by
### race, then sex, then age group. `vapply()` fixes the shape of each
### intermediate result, so the ordering cannot silently change.
arsvars <- as.vector(vapply(
    races, paste0, character(length(sexes) * length(agegrps)),
    vapply(sexes, paste0, character(length(agegrps)), agegrps)
))

## Identify geographic identifiers to keep
geovars <- c('GISJOIN', 'STATE', 'COUNTY')

## Set Functions
source('dcarea_functions.R')

## Create dataset that contains only DC-area tracts and constructed
## age-race-sex variables
## Arguments: `dta`: dataset holding the `geovars` and `arsvars` columns
## Returns: DC-area rows of `dta`, restricted to `geovars` and `arsvars`,
##     with `GISJOIN` converted to character
select.arsvars <- function(dta) {
    dta <- select.dcarea(dta)
    dta$GISJOIN <- as.character(dta$GISJOIN)
    return(dta[, c(geovars, arsvars)])
}

## Load datasets containing age-race-sex variables from 2010 TIGER/Line files
load('../US/tracts/2010/tabular/age-race-sex/dataset/tracts-2010TIGER-age-race-sex.Rdata')

## Create list of data frames for each year and restrict to created
## age-race-sex variables
dtas <- list(trt10_ars, trt14_ars, trt15_ars, trt16_ars, trt17_ars)
arsdtas <- lapply(dtas, select.arsvars)

## Create single wide data frame with year appended to variable name
suffixes <- c('10', 14:17)
namelist <- lapply(suffixes, function(x) c(geovars, paste0(arsvars, x)))
arsdta <- mapply(setNames, arsdtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
arsdta$STATE <- arsdta$STATE.x
arsdta$COUNTY <- arsdta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
arsdta <- arsdta[, !grepl('^STATE.+|^COUNTY.+', names(arsdta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(arsdta, 'tracts/2010/tabular/age-race-sex/dataset/tracts-2010TIGER-age-race-sex.csv')

BIN
tracts/2010/tabular/age-race-sex/dataset/tracts-2010TIGER-age-race-sex.csv (Stored with Git LFS)

size 2653606

50
tracts/2010/tabular/children-present/construction/construct-tracts-2010TIGER-children-pres.R

@ -0,0 +1,50 @@
## Creates single file containing household presence of children variables
## for tracts based on 2010 Census and the 2010-2014 through 2013-2017
## American Community Surveys for tracts in the DC Area
##
## tothh = Total households
## chpr = Households with children present
## ncpr = Households with no children present

## Set up environment
library(tidyverse)

## Identify variable abbreviations
## (`p`-prefixed names are built for every abbreviation except `tothh`)
presence <- c('tothh', 'chpr', 'ncpr')
ppresence <- paste0('p',presence[-1])
presvars <- c(presence, ppresence)
geovars <- c('GISJOIN', 'STATE', 'COUNTY')

## Set Functions
source('dcarea_functions.R')

## Create dataset that contains only DC-area tracts and constructed
## children present variables
## Arguments: `dta`: dataset holding the `geovars` and `presvars` columns
## Returns: DC-area rows of `dta`, restricted to `geovars` and `presvars`,
##     with `GISJOIN` converted to character
select.presvars <- function(dta) {
    dta <- select.dcarea(dta)
    dta$GISJOIN <- as.character(dta$GISJOIN)
    return(dta[, c(geovars, presvars)])
}

## Load datasets containing children present variables from files based on
## 2010 tracts
load('../US/tracts/2010/tabular/children-present/dataset/tracts-2010TIGER-children-present.Rdata')

## Create list of data frames for each year and restrict to created
## children present variables
dtas <- list(trt10c_cp, trt14_cp, trt15_cp, trt16_cp, trt17_cp)
presdtas <- lapply(dtas, select.presvars)

## Create single wide data frame with year appended to variable name
namelist <- lapply(c('10c', 14:17),
                   function(x) c(geovars, paste0(presvars, x)))
presdta <- mapply(setNames, presdtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
presdta$STATE <- presdta$STATE.x
presdta$COUNTY <- presdta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
presdta <- presdta[, !grepl('^STATE.+|^COUNTY.+', names(presdta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(presdta, 'tracts/2010/tabular/children-present/dataset/tracts-2010TIGER-children-present.csv')

BIN
tracts/2010/tabular/children-present/dataset/tracts-2010TIGER-children-present.csv (Stored with Git LFS)

size 292289

53
tracts/2010/tabular/educ-attainment/construction/construct-tracts-2010TIGER-educ-attainment.R

@ -0,0 +1,53 @@
## Creates single file containing educational attainment variables from the
## 2010-2014 through 2013-2017 American Community Surveys for tracts
## in the DC Area
## tot25o = Total population 25 and older
## lh = Less than high school
## hs = High school or GED
## sc = Some college
## aa = Associate's degree
## ba = Bachelor's degree
## gr = Graduate degree

## Set up environment
library(tidyverse)

## Identify variable abbreviations
## (`p`-prefixed names are built for every abbreviation except `tot25o`)
attain <- c('tot25o', 'lh', 'hs', 'sc', 'aa', 'ba', 'gr')
pattain <- paste0('p',attain[-1])
attainvars <- c(attain, pattain)
geovars <- c('GISJOIN', 'STATE', 'COUNTY')

## Set Functions
source('dcarea_functions.R')

## Create dataset that contains only DC-area tracts and constructed
## educational attainment variables
## Arguments: `dta`: dataset holding the `geovars` and `attainvars` columns
## Returns: DC-area rows of `dta`, restricted to `geovars` and `attainvars`,
##     with `GISJOIN` converted to character
select.attainvars <- function(dta) {
    dta <- select.dcarea(dta)
    dta$GISJOIN <- as.character(dta$GISJOIN)
    return(dta[, c(geovars, attainvars)])
}

## Load datasets containing educational attainment variables from files based on
## 2010 tracts
load('../US/tracts/2010/tabular/educ-attainment/dataset/tracts-2010TIGER-educ-attainment.Rdata')

## Create list of data frames for each year and restrict to created
## educational attainment variables
dtas <- list(trt14_ed, trt15_ed, trt16_ed, trt17_ed)
attaindtas <- lapply(dtas, select.attainvars)

## Create single wide data frame with year appended to variable name
namelist <- lapply(14:17,
                   function(x) c(geovars, paste0(attainvars, x)))
attaindta <- mapply(setNames, attaindtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
attaindta$STATE <- attaindta$STATE.x
attaindta$COUNTY <- attaindta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
attaindta <- attaindta[, !grepl('^STATE.+|^COUNTY.+', names(attaindta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(attaindta, 'tracts/2010/tabular/educ-attainment/dataset/tracts-2010TIGER-educ-attainment.csv')

BIN
tracts/2010/tabular/educ-attainment/dataset/tracts-2010TIGER-educ-attainment.csv (Stored with Git LFS)

size 591056

59
tracts/2010/tabular/foreign-born/construction/construct-tracts-2010TIGER-foreign-born.R

@ -0,0 +1,59 @@
## Creates single file containing foreign-born variables from the
## 2010-2014 through 2013-2017 American Community Surveys for tracts
## in the DC Area
## fbpop = Foreign-born population
## eur = European foreign-born
## asi = Asian foreign-born
## afr = African foreign-born
## oce = Oceania foreign-born
## lat = Latin American foreign-born
## Note `p` variables represent *proportion of foreign-born residents*
##
## Note: Remainder of foreign born are from North America, i.e.:
## fbpop - sum(eur + asi + afr + oce + lat) = N. American foreign-born

## Set up environment
library(tidyverse)

## Identify variable abbreviations
## (`p`-prefixed names are built for every abbreviation except `fbpop`)
regions <- c('fbpop', 'eur','asi','afr','oce','lat')
pregions <- paste0('p',regions[-1])
regionvars <- c(regions, pregions)
geovars <- c('GISJOIN', 'STATE', 'COUNTY')

## Set Functions
source('dcarea_functions.R')

## Create dataset that contains only DC-area tracts and constructed
## foreign-born variables
## Arguments: `dta`: dataset holding the `geovars` and `regionvars` columns
## Returns: DC-area rows of `dta`, restricted to `geovars` and `regionvars`,
##     with `GISJOIN` converted to character
select.regionvars <- function(dta) {
    dta <- select.dcarea(dta)
    dta$GISJOIN <- as.character(dta$GISJOIN)
    return(dta[, c(geovars, regionvars)])
}

## Load datasets containing foreign-born variables from files based on
## 2010 tracts
load('../US/tracts/2010/tabular/foreign-born/dataset/tracts-2010TIGER-foreign-born.Rdata')

## Create list of data frames for each year and restrict to created
## foreign-born variables
dtas <- list(trt14_fb, trt15_fb, trt16_fb, trt17_fb)
regiondtas <- lapply(dtas, select.regionvars)

## Create single wide data frame with year appended to variable name
namelist <- lapply(14:17,
                   function(x) c(geovars, paste0(regionvars, x)))
regiondta <- mapply(setNames, regiondtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
regiondta$STATE <- regiondta$STATE.x
regiondta$COUNTY <- regiondta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
regiondta <- regiondta[, !grepl('^STATE.+|^COUNTY.+', names(regiondta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(regiondta, 'tracts/2010/tabular/foreign-born/dataset/tracts-2010TIGER-foreign-born.csv')

BIN
tracts/2010/tabular/foreign-born/dataset/tracts-2010TIGER-foreign-born.csv (Stored with Git LFS)

size 418196

52
tracts/2010/tabular/marital-status/construction/construct-tracts-2010TIGER-marital-status.R

@ -0,0 +1,52 @@
## Creates single file containing marital status variables from the
## 2010-2014 through 2013-2017 American Community Surveys for tracts
## in the DC Area

## Set up environment
library(tidyverse)

## Identify marital status variable names
## NOTE(review): abbreviations presumably stand for married, never married,
## widowed and divorced -- confirm against the source dataset
## (a `p`-prefixed name is built for every status; there is no total
## variable in this list)
statuses <- c('mar', 'nvm', 'wid', 'div')
pstatuses <- paste0('p', statuses)
statusvars <- c(statuses, pstatuses)

## Identify geographic identifiers to keep
geovars <- c('GISJOIN', 'STATE', 'COUNTY')

## Set Functions
source('dcarea_functions.R')

## Define function to create dataset that contains only DC-area tracts and
## constructed marital status variables
## Arguments: `dta`: dataset holding the `geovars` and `statusvars` columns
## Returns: DC-area rows of `dta`, restricted to `geovars` and `statusvars`,
##     with `GISJOIN` converted to character
select.statusvars <- function(dta) {
    dta <- select.dcarea(dta)
    dta$GISJOIN <- as.character(dta$GISJOIN)
    return(dta[, c(geovars, statusvars)])
}

## Load datasets containing marital status variables using data from
## files based on 2010 tracts
load('../US/tracts/2010/tabular/marital-status/dataset/tracts-2010TIGER-marital-status.Rdata')

## Create list of data frames for each year and restrict to created
## marital status variables
dtas <- list(trt14_ms, trt15_ms, trt16_ms, trt17_ms)
statusdtas <- lapply(dtas, select.statusvars)

## Create single wide data frame with year appended to variable name
namelist <- lapply(14:17, function(x) c(geovars, paste0(statusvars, x)))
statusdta <- mapply(setNames, statusdtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
statusdta$STATE <- statusdta$STATE.x
statusdta$COUNTY <- statusdta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
statusdta <- statusdta[, !grepl('^STATE.+|^COUNTY.+', names(statusdta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(statusdta,
          'tracts/2010/tabular/marital-status/dataset/tracts-2010TIGER-marital-status.csv')

BIN
tracts/2010/tabular/marital-status/dataset/tracts-2010TIGER-marital-status.csv (Stored with Git LFS)

size 403477

42
tracts/2010/tabular/median-age/construction/construct-tracts-2010TIGER-median-age.R

@ -0,0 +1,42 @@
## Creates single file containing the median age variable at the
## tract level for DC Area from the 2010 Census and 2010-2014 through
## 2013-2017 American Community Surveys for tracts in the DC Area

## Set up environment
library(tidyverse)

## Name of the median age variable
agevar <- 'mdage'

## Identify geographic identifiers to keep
geovars <- c('GISJOIN', 'STATE', 'COUNTY')

## Set Functions
source('dcarea_functions.R')

## Create dataset that contains only DC-area tracts and the median age
## variable
## Arguments: `dta`: dataset holding the `geovars` and `agevar` columns
## Returns: DC-area rows of `dta`, restricted to `geovars` and `agevar`,
##     with `GISJOIN` converted to character
select.agevars <- function(dta) {
    dta <- select.dcarea(dta)
    dta$GISJOIN <- as.character(dta$GISJOIN)
    return(dta[, c(geovars, agevar)])
}

## Load datasets containing median age variables from 2010 TIGER/Line files
load('../US/tracts/2010/tabular/median-age/dataset/tracts-2010TIGER-median-age.Rdata')

## Create list of data frames for each year and restrict to the median age
## variable
dtas <- list(trt10c_ag, trt14_ag, trt15_ag, trt16_ag, trt17_ag)
agedtas <- lapply(dtas, select.agevars)

## Create single wide data frame with year appended to variable name
suffixes <- c('10c', 14:17)
namelist <- lapply(suffixes, function(x) c(geovars, paste0(agevar, x)))
agedta <- mapply(setNames, agedtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
agedta$STATE <- agedta$STATE.x
agedta$COUNTY <- agedta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
agedta <- agedta[, !grepl('^STATE.+|^COUNTY.+', names(agedta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(agedta, 'tracts/2010/tabular/median-age/dataset/tracts-2010TIGER-median-age.csv')

BIN
tracts/2010/tabular/median-age/dataset/tracts-2010TIGER-median-age.csv (Stored with Git LFS)

size 78786

63
tracts/2010/tabular/race-ethnicity/construction/construct-create-tracts-2010TIGER-race-ethnicity.R

@ -0,0 +1,63 @@
## Builds one wide file of tract-level race & ethnicity variables for the
## DC Area from the 2010 Census and American Community Survey data.
## NOTE(review): the header originally gave the ACS range as 2007-2011
## through 2011-2015, but this script also uses `trt16_re` and `trt17_re`
## -- confirm the intended range.
##
## The file also flags, for each year, whether a tract qualifies as a
## "quadrivial" neighborhood (Asians, blacks, Latinxs, and whites each make
## up at least 10% of the tract and no group is a majority).

## Packages
library(tidyverse)

## Race variable names: the base names first, then a `p`-prefixed name for
## every base name except the first (`totpop`)
races <- c('totpop', 'nhw', 'nhb', 'api', 'hsp', 'oth', 'two')
praces <- paste0('p', tail(races, -1))
racevars <- append(races, praces)

## Names of the variables to keep that are not race variables
nonracevars <- c('GISJOIN', 'STATE', 'COUNTY')

## Bring in `select.dcarea()`
source('dcarea_functions.R')

## Restricts a dataset to DC-area tracts and the constructed race variables
## Arguments: `dta`: dataset holding the `nonracevars` and `racevars` columns
## Returns: DC-area rows of `dta`, limited to `nonracevars` and `racevars`,
##     with `GISJOIN` converted to character
select.racevars <- function(dta) {
    dcdta <- select.dcarea(dta)
    dcdta$GISJOIN <- as.character(dcdta$GISJOIN)
    keep <- c(nonracevars, racevars)
    dcdta[, keep]
}
## Identify quadrivial neighborhoods
## Arguments: `dta`: dataset containing the proportion variables in `pvars`
##            `pvars`: names of the proportion variables that must all fall
##                in the eligible range; defaults to the first four entries
##                of the script-level `praces` vector
## Returns: `dta` with an added logical `quad` column that is TRUE when every
##     variable in `pvars` is at least .10 and strictly below .5 for that row
id.quads <- function(dta, pvars = praces[1:4]) {
    ## Compare on a matrix so the result keeps one row per tract even when
    ## `dta` has a single row (or none); `sapply()` over the columns would
    ## collapse to a plain vector or a list there and break the row-wise `apply()`
    shares <- as.matrix(dta[, pvars, drop = FALSE])
    eligible <- shares >= .10 & shares < .5
    ## `unname()` keeps the new column free of row names carried by the matrix
    dta['quad'] <- unname(apply(eligible, 1, all))
    return(dta)
}
# ## Sets variable names for a specific dataset in the `racedtas` list
# set.varnames <- function(x,y) setNames(racedtas[[x]], y)

## Load datasets containing race-ethnicity variables from 2010 TIGER/Line files
## for US tracts 2010
load('../US/tracts/2010/tabular/race-ethnicity/dataset/tracts-2010TIGER-race-ethnicity.Rdata')

## Create list of data frames for each year, restrict to created race
## variables and flag quadrivial tracts
dtas <- list(trt10_re, trt11_re, trt12_re, trt13_re, trt14_re, trt15_re,
             trt16_re, trt17_re)
racedtas <- lapply(dtas, select.racevars) %>%
    lapply(id.quads)

## Create single wide data frame with year appended to variable name
## (`quad` is added by `id.quads()`, so it is suffixed along with the race
## variables)
namelist <- lapply(10:17,
                   function(x) c(nonracevars, paste0(c(racevars, 'quad'), x)))
racedta <- mapply(setNames, racedtas, namelist, SIMPLIFY = FALSE) %>%
    reduce(left_join, by='GISJOIN')

## Every data frame carries its own STATE and COUNTY, so the joins leave
## suffixed copies (`STATE.x`, `COUNTY.x`, ...). Keep the copy from the first
## data frame under the plain name and drop all suffixed copies.
racedta$STATE <- racedta$STATE.x
racedta$COUNTY <- racedta$COUNTY.x
## `!grepl()` rather than `-(grep())`: a negative index built from an empty
## match would select no columns at all
racedta <- racedta[, !grepl('^STATE.+|^COUNTY.+', names(racedta), perl=TRUE), drop = FALSE]

## Write file containing wide dataset to CSV format
write.csv(racedta, 'tracts/2010/tabular/race-ethnicity/dataset/tracts-2010TIGER-race-ethnicity.csv')

BIN
tracts/2010/tabular/race-ethnicity/dataset/tracts-2010TIGER-race-ethnicity.csv (Stored with Git LFS)

size 1114494
Loading…
Cancel
Save