Important Packages/Libraries for this Lab:
options(repos = c(CRAN = "https://cloud.r-project.org"))
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
if(!require("sandwich")) install.packages("sandwich")
## Loading required package: sandwich
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
if(!require("dotwhisker")) install.packages("dotwhisker")
## Loading required package: dotwhisker
library(dotwhisker) # coefplots
if(!require("broom")) install.packages("broom")
## Loading required package: broom
library(broom) # coefplots
if(!require("margins")) install.packages("margins")
## Loading required package: margins
library(margins) # coefplots
rm(list = ls())
source("/Volumes/middfiles/Classes/Fall23/ECON0211A/Noe_Labs/NV.R") #load NV.R to access nv_sum() function for quick summary statistic tables
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
Read Data
usa77<-read_csv("IPUMS_USA_1970_2017.csv")
## Rows: 945183 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): sample, gq, sex, race, raced, hispan, hispand, educ, educd, empsta...
## dbl (8): year, serial, cbserial, hhwt, pernum, perwt, age, incwage
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa77
## # A tibble: 945,183 × 23
## year sample serial cbserial hhwt gq pernum perwt sex age race raced
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 1970 1970 … 14 NA 587 hous… 1 587 male 41 white white
## 2 1970 1970 … 14 NA 587 hous… 2 587 fema… 38 white white
## 3 1970 1970 … 37 NA 587 hous… 1 587 male 30 white white
## 4 1970 1970 … 37 NA 587 hous… 2 587 fema… 27 white white
## 5 1970 1970 … 55 NA 587 hous… 1 587 male 27 white white
## 6 1970 1970 … 61 NA 587 hous… 1 587 male 45 white white
## 7 1970 1970 … 61 NA 587 hous… 2 587 fema… 46 white white
## 8 1970 1970 … 78 NA 587 hous… 1 587 male 51 white white
## 9 1970 1970 … 78 NA 587 hous… 2 587 fema… 49 white white
## 10 1970 1970 … 84 NA 587 hous… 1 587 male 55 white white
## # ℹ 945,173 more rows
## # ℹ 11 more variables: hispan <chr>, hispand <chr>, educ <chr>, educd <chr>,
## # empstat <chr>, empstatd <chr>, labforce <chr>, wkswork2 <chr>,
## # hrswork2 <chr>, uhrswork <chr>, incwage <dbl>
usa77 <-usa77%>%
mutate(codeduc=with(usa77, match(educ, unique(educ))))
usa77%>%
count(empstat)
## # A tibble: 3 × 2
## empstat n
## <chr> <int>
## 1 employed 698967
## 2 not in labor force 209731
## 3 unemployed 36485
Creating education categories using the numerical values in
educnum
usa77<-usa77%>%
mutate(edcat =case_when(codeduc== 1 |codeduc== 3 |codeduc== 4 |codeduc== 5 | codeduc==10 |codeduc== 12 ~"Less than HS",
codeduc ==2 ~ "High School",
codeduc ==7 |codeduc==8 |codeduc==11 ~"Some College",
codeduc ==6~ "Bachelors",
codeduc ==9~"Graduate",
TRUE~"N/A"),
inlf=ifelse(empstat!="not in labor force",1,0))
usa77%>%
count(edcat, educ, edcat, empstat, inlf)
## # A tibble: 36 × 5
## edcat educ empstat inlf n
## <chr> <chr> <chr> <dbl> <int>
## 1 Bachelors 4 years of college employed 1 122825
## 2 Bachelors 4 years of college not in labor force 0 20782
## 3 Bachelors 4 years of college unemployed 1 3744
## 4 Graduate 5+ years of college employed 1 79768
## 5 Graduate 5+ years of college not in labor force 0 8927
## 6 Graduate 5+ years of college unemployed 1 1705
## 7 High School grade 12 employed 1 241645
## 8 High School grade 12 not in labor force 0 83464
## 9 High School grade 12 unemployed 1 14991
## 10 Less than HS grade 10 employed 1 20341
## # ℹ 26 more rows
Including weights
sex_inlf<- function(data, gender){
data %>%
filter(sex=={{gender}})%>%
group_by(year, edcat)%>%
summarize(mean_inlf = sum(inlf * perwt) / sum(perwt), .groups = 'drop')%>% # weighted mean of inlf
pivot_wider(names_from = edcat,
values_from = mean_inlf)
}
sex_inlf(usa77, "male") # male
## # A tibble: 6 × 6
## year Bachelors Graduate `High School` `Less than HS` `Some College`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1970 0.967 0.963 0.956 0.887 0.944
## 2 1980 0.960 0.953 0.925 0.822 0.929
## 3 1990 0.952 0.956 0.889 0.768 0.923
## 4 2000 0.924 0.924 0.861 0.774 0.897
## 5 2010 0.931 0.949 0.820 0.712 0.879
## 6 2017 0.929 0.950 0.811 0.711 0.870
sex_inlf(usa77,"female") # female
## # A tibble: 6 × 6
## year Bachelors Graduate `High School` `Less than HS` `Some College`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1970 0.551 0.750 0.503 0.447 0.513
## 2 1980 0.704 0.812 0.611 0.475 0.673
## 3 1990 0.814 0.869 0.694 0.502 0.779
## 4 2000 0.788 0.836 0.710 0.509 0.771
## 5 2010 0.816 0.869 0.711 0.519 0.785
## 6 2017 0.823 0.867 0.687 0.482 0.777
A Small Summary stats Table
usa77<-usa77%>%
mutate(LTHS=ifelse(edcat=="Less than HS",1,0),
HS =ifelse(edcat =="High School",1,0),
SC =ifelse(edcat=="Some College",1,0 ),
BA =ifelse(edcat =="Bachelors", 1, 0),
GRAD =ifelse(edcat =="Graduate", 1, 0)
)
summary tables
nv_sum(usa77, LTHS, HS, SC, BA, GRAD, inlf, weight = TRUE)
## # A tibble: 6 × 7
## variable Obs min mean median St.dev max
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 LTHS 945183 0 0.165 0 0.371 1
## 2 HS 945183 0 0.357 0 0.479 1
## 3 SC 945183 0 0.220 0 0.415 1
## 4 BA 945183 0 0.162 0 0.368 1
## 5 GRAD 945183 0 0.0950 0 0.293 1
## 6 inlf 945183 0 0.784 1 0.411 1
nv_sum(usa77, LTHS, HS, SC, BA, GRAD, inlf, weight = TRUE, condition = quo(year==1970))
## # A tibble: 6 × 7
## variable Obs min mean median St.dev max
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 LTHS 140910 0 0.403 0 0.491 1
## 2 HS 140910 0 0.361 0 0.480 1
## 3 SC 140910 0 0.114 0 0.318 1
## 4 BA 140910 0 0.0685 0 0.253 1
## 5 GRAD 140910 0 0.0530 0 0.224 1
## 6 inlf 140910 0 0.703 1 0.457 1
nv_sum(usa77, LTHS, HS, SC, BA, GRAD, inlf, weight = TRUE, condition = quo(year==2017))
## # A tibble: 6 × 7
## variable Obs min mean median St.dev max
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 LTHS 146525 0 0.0923 0 0.289 1
## 2 HS 146525 0 0.333 0 0.471 1
## 3 SC 146525 0 0.239 0 0.427 1
## 4 BA 146525 0 0.213 0 0.409 1
## 5 GRAD 146525 0 0.123 0 0.328 1
## 6 inlf 146525 0 0.800 1 0.400 1
Making summary tables by grouping
nv_sum(usa77%>%group_by(year), LTHS, HS, SC, BA, GRAD, inlf, weight = TRUE) %>%
dplyr::select(year, variable, mean)%>%
pivot_wider(names_from =variable ,
values_from = mean)
## # A tibble: 6 × 7
## year LTHS HS SC BA GRAD inlf
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1970 0.403 0.361 0.114 0.0685 0.0530 0.703
## 2 1980 0.253 0.381 0.178 0.0985 0.0890 0.753
## 3 1990 0.148 0.342 0.279 0.150 0.0809 0.806
## 4 2000 0.113 0.391 0.219 0.181 0.0956 0.800
## 5 2010 0.111 0.346 0.243 0.196 0.104 0.801
## 6 2017 0.0923 0.333 0.239 0.213 0.123 0.800
nv_sum(usa77%>%
group_by(year, sex),
LTHS, HS, SC, BA, GRAD, inlf, weight = TRUE) %>%
dplyr::select(year, sex, variable, mean)%>%
pivot_wider(names_from = variable,
values_from = mean)
## # A tibble: 12 × 8
## year sex LTHS HS SC BA GRAD inlf
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1970 female 0.398 0.400 0.111 0.0605 0.0302 0.492
## 2 1970 male 0.409 0.319 0.118 0.0771 0.0773 0.928
## 3 1980 female 0.255 0.421 0.174 0.0859 0.0628 0.608
## 4 1980 male 0.250 0.339 0.183 0.112 0.117 0.907
## 5 1990 female 0.144 0.363 0.285 0.141 0.0666 0.719
## 6 1990 male 0.153 0.320 0.273 0.158 0.0955 0.896
## 7 2000 female 0.107 0.396 0.227 0.181 0.0884 0.727
## 8 2000 male 0.119 0.386 0.211 0.181 0.103 0.876
## 9 2010 female 0.0996 0.329 0.257 0.206 0.108 0.749
## 10 2010 male 0.123 0.365 0.227 0.185 0.100 0.853
## 11 2017 female 0.0810 0.305 0.252 0.226 0.135 0.748
## 12 2017 male 0.104 0.361 0.226 0.200 0.110 0.853
Create new Variables
usa77$uhrswork<-as.numeric(usa77$uhrswork)
## Warning: NAs introduced by coercion
usa77$edcat_num<-as.numeric(factor(usa77$edcat, levels = c("Less than HS", "High School", "Some College", "Bachelors", "Graduate"))) # creating edcat_num, based on edcat
usa77$edcat_num<-as.factor(usa77$edcat_num) # converting it to factor
ed<-as.data.frame(model.matrix(~edcat_num-1, data = usa77))
colnames(ed)<-paste0("ed", seq_along(colnames(ed)))
usa77_2<-cbind(usa77, ed)
usa77_2<-usa77_2%>%
mutate(codehrs=with(usa77, match(hrswork2, unique(hrswork2))),
hrover40_2 =ifelse(hrswork2=="40 hours" |hrswork2=="41-48 hours"|hrswork2=="49-59 hours"|hrswork2=="60+ hours",1,0),
wkover40=ifelse(wkswork2=="40-47 weeks"| wkswork2=="48-49 weeks"|wkswork2=="50-52 weeks",1,0),
fulltime_year =case_when(uhrswork>=40 & wkover40==1~ "1",
hrover40_2==1 & year=="1970" & wkover40==1~ "1",
hrover40_2==0 & year=="1970" & wkover40==1~ "0",
TRUE~"0"),
ln_wage =ifelse(incwage>0, log(incwage),NA),
male =ifelse(sex=="male",1,0),
age2 = age^2
)
#usa77_2%>%
# count( edcat_num, ln_wage)
try loops
for(i in 1:10){
y<-i*2
print(y)
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10
## [1] 12
## [1] 14
## [1] 16
## [1] 18
## [1] 20
# alternatives of the loops: apply family functions:
# apply(), lapply(), tapply(), sapply(), vapply(), and mapply()
y<-lapply(1:10, function(x){
x*2
})
y
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 6
##
## [[4]]
## [1] 8
##
## [[5]]
## [1] 10
##
## [[6]]
## [1] 12
##
## [[7]]
## [1] 14
##
## [[8]]
## [1] 16
##
## [[9]]
## [1] 18
##
## [[10]]
## [1] 20
# to get results as a list
y<-sapply(1:10, function(x){
x*2
})
y
## [1] 2 4 6 8 10 12 14 16 18 20
regression results
list_year<-c(1970, 1980, 1990, 2000, 2010, 2017)
for(i in list_year)(
print(i)
)
## [1] 1970
## [1] 1980
## [1] 1990
## [1] 2000
## [1] 2010
## [1] 2017
list_year<-c(1970, 1980, 1990, 2000, 2010, 2017)
#ed_list<-c("ed2", "ed3", "ed4","ed5")
results<-list()
results2<-list()
graph<-list()
graph2<-list()
for(i in list_year){
m<-lm(ln_wage~male, subset = (fulltime_year ==1 & year==i), weights = perwt, data = usa77_2)
rob_se <- coeftest(m, vcov = vcovHC(m, type = "HC1")) #extract st.error
results[[as.character(i)]] <- list(model = m, robust_se = rob_se) #put the models and their robust St.Errors in the list results
graph[[as.character(i)]]<-m # extract coefficient to plot later
m2<-lm(ln_wage~male+ed2+ed3+ed4+ed5+age+age2, subset = (fulltime_year==1 & year ==i), weights = perwt, data = usa77_2)
rob_se2 <- coeftest(m2, vcov = vcovHC(m2, type = "HC1")) #extract st.error
results2[[as.character(i)]]<-list(model=m2, robust_se = rob_se2)
graph2[[as.character(i)]]<-m2 # extract coefficient to plot later
}
all_models <- c(lapply(results, function(x) x$model), lapply(results2, function(x) x$model)) #combine the models from the model results using x$models (model =key used to store models in the list)
all_se <- c(lapply(results, function(x) x$robust_se[, "Std. Error"]), lapply(results2, function(x) x$robust_se[, "Std. Error"])) #combine rob_se from the model results using x$robust_se (robust_se =key used to store the rob_se in the list)
stargazer::stargazer(all_models,
se=all_se,
type = "text",
out = "table1.text",
title = "Wages",
column.labels = rep(as.character(list_year),2), #duplicate labels
notes.append = TRUE)
##
## Wages
## ==================================================================================================================================================================================================================================================================================================================================================================================
## Dependent variable:
## --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## ln_wage
## 1970 1980 1990 2000 2010 2017 1970 1980 1990 2000 2010 2017
## (1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12)
## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## male 0.603*** 0.501*** 0.382*** 0.299*** 0.228*** 0.198*** 0.589*** 0.477*** 0.374*** 0.315*** 0.278*** 0.279***
## (0.006) (0.005) (0.004) (0.005) (0.006) (0.006) (0.005) (0.005) (0.004) (0.004) (0.005) (0.005)
##
## ed2 0.254*** 0.260*** 0.266*** 0.300*** 0.339*** 0.299***
## (0.006) (0.007) (0.007) (0.009) (0.010) (0.012)
##
## ed3 0.389*** 0.374*** 0.451*** 0.489*** 0.548*** 0.491***
## (0.008) (0.008) (0.007) (0.009) (0.011) (0.012)
##
## ed4 0.632*** 0.569*** 0.713*** 0.801*** 0.901*** 0.891***
## (0.009) (0.009) (0.008) (0.010) (0.011) (0.012)
##
## ed5 0.644*** 0.632*** 0.898*** 1.015*** 1.139*** 1.133***
## (0.011) (0.009) (0.010) (0.011) (0.012) (0.013)
##
## age 0.048*** 0.060*** 0.060*** 0.057*** 0.073*** 0.067***
## (0.002) (0.002) (0.002) (0.002) (0.002) (0.002)
##
## age2 -0.001*** -0.001*** -0.001*** -0.001*** -0.001*** -0.001***
## (0.00002) (0.00002) (0.00002) (0.00003) (0.00003) (0.00003)
##
## Constant 8.412*** 9.203*** 9.853*** 10.240*** 10.561*** 10.696*** 7.122*** 7.542*** 8.031*** 8.391*** 8.203*** 8.396***
## (0.005) (0.004) (0.003) (0.004) (0.004) (0.005) (0.042) (0.040) (0.036) (0.041) (0.048) (0.050)
##
## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Observations 59,417 73,491 95,194 107,582 75,684 82,439 59,417 73,491 95,194 107,582 75,684 82,439
## R2 0.159 0.116 0.083 0.045 0.027 0.018 0.268 0.215 0.266 0.248 0.282 0.278
## Adjusted R2 0.159 0.116 0.083 0.045 0.027 0.018 0.268 0.215 0.266 0.248 0.282 0.278
## Residual Std. Error 14.714 (df = 59415) 16.691 (df = 73489) 16.051 (df = 95179) 18.185 (df = 107580) 21.803 (df = 75682) 23.365 (df = 82437) 13.733 (df = 59409) 15.728 (df = 73483) 14.358 (df = 95173) 16.140 (df = 107574) 18.724 (df = 75676) 20.038 (df = 82431)
## F Statistic 11,251.820*** (df = 1; 59415) 9,629.924*** (df = 1; 73489) 8,571.346*** (df = 1; 95179) 5,124.615*** (df = 1; 107580) 2,070.375*** (df = 1; 75682) 1,544.091*** (df = 1; 82437) 3,101.840*** (df = 7; 59409) 2,875.181*** (df = 7; 73483) 4,928.811*** (df = 7; 95173) 5,073.222*** (df = 7; 107574) 4,250.480*** (df = 7; 75676) 4,536.482*** (df = 7; 82431)
## ==================================================================================================================================================================================================================================================================================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
ploting coefficients
# 1st way: Using dwplot function
dwplot(graph)+
labs(
title="Evolution of the US Gender Gap",
x="Coefficients"
)

# 2nd method: using ggplot, which allows including more feature and chart manipulation
coef_data <- bind_rows(
lapply(names(graph2), function(year) {
tidy(graph2[[year]], conf.int = TRUE) %>%
filter(term == "male") %>%
mutate(model = paste0("Model_", year), # Add a model identifier
year = year)
})
)
ggplot(coef_data,
aes( x = year,y = estimate, color = year)) +
geom_point(size =1)+
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width=0.2)+
theme_minimal() +
labs(title = "Evolution of the US Gender Gap",
x= "Male",
y = "Coefficient Estimate")

creating race dummies
usa77_2%>%
group_by(hispan)%>%
summarise(mean = mean(ln_wage, na.rm = TRUE))
## # A tibble: 5 × 2
## hispan mean
## <chr> <dbl>
## 1 cuban 9.78
## 2 mexican 9.70
## 3 not hispanic 9.77
## 4 other 9.83
## 5 puerto rican 9.77
usa77_2<-usa77_2%>%
mutate(white_nh=ifelse(race=="white" & hispan=="not hispanic", 1,0),
black_nh=ifelse(race=="black/african american/negro" & hispan =="not hispanic",1, 0),
hispan_nh=ifelse(hispan!="not hispanic", 1, 0),
other_nh=ifelse(race!="white" & race !="black/african american/negro" & hispan =="not hispanic",1,0))
#check the proportion of each group
nv_sum(usa77_2, white_nh, black_nh, hispan_nh, other_nh, weight = FALSE)
## variable Obs min mean median st.dev max
## 1 white_nh 945183 0 0.76344898 1 0.4249645 1
## 2 black_nh 945183 0 0.09663525 0 0.2954606 1
## 3 hispan_nh 945183 0 0.08921870 0 0.2850593 1
## 4 other_nh 945183 0 0.05069706 0 0.2193785 1
usa77_2<-usa77_2%>%
mutate(racecat=case_when(white_nh==1~"white_nh",
black_nh==1~"black_nh",
other_nh==1~"other_nh",
hispan_nh==1~"hispanic"),
college =ifelse(edcat=="Bachelors" | edcat=="Graduate", 1,0))
usa77_2%>%
count(edcat)
usa77_2%>%
dplyr::select(college ,edcat)
Ploting predicted lines for each race category
mod<-lm(ln_wage~college*racecat, subset = (year==2017), data=usa77_2)
summary(mod)
##
## Call:
## lm(formula = ln_wage ~ college * racecat, data = usa77_2, subset = (year ==
## 2017))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5749 -0.3772 0.1362 0.5967 3.3217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.97464 0.01182 844.091 < 2e-16 ***
## college 0.72292 0.02255 32.066 < 2e-16 ***
## racecathispanic 0.14085 0.01482 9.503 < 2e-16 ***
## racecatother_nh 0.14199 0.01894 7.496 6.64e-14 ***
## racecatwhite_nh 0.35482 0.01277 27.788 < 2e-16 ***
## college:racecathispanic -0.12543 0.02969 -4.224 2.40e-05 ***
## college:racecatother_nh 0.11193 0.03012 3.716 0.000202 ***
## college:racecatwhite_nh -0.09119 0.02372 -3.844 0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.005 on 111941 degrees of freedom
## (34576 observations deleted due to missingness)
## Multiple R-squared: 0.1111, Adjusted R-squared: 0.111
## F-statistic: 1998 on 7 and 111941 DF, p-value: < 2.2e-16
margins_mod <- margins(mod, variables = "college", at = list(racecat = unique(usa77_2$racecat)))
margins_df <- as.data.frame(margins_mod)
ggplot(margins_df, aes(x = factor(college), y = fitted, color = factor(racecat))) +
geom_point() +
#geom_line(aes(group = racecat)) +
labs(x = "college", y = "Linear Prediction",
color = "Treated",
title = "Predicted Ln(wages) by BA status and Race",
subtitle = "US ACS 2017") +
theme_minimal()
