Important packages (libraries) for this lab:
- tidyverse
- haven
- stargazer
- gt: for saving summary tables # use the data from stata
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(haven) #to help merge
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
if(!require("gt")) install.packages("gt")
## Loading required package: gt
library(gt)
rm(list = ls())
Load data
usa80_lab4<-read_csv("Ipums_USA_census_1980.csv")
## Rows: 148362 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): sample, gq, sex, race, raced, hispan, hispand, educ, educd, wkswor...
## dbl (18): year, serial, hhwt, cluster, strata, pernum, perwt, age, incwage, ...
## lgl (1): cbserial
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cpi<- read_csv("cpi_inflation_adju.csv")
## Rows: 76 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): year, cpi, cpi_2021_factor, cpi_adjust_to_2021
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa21_lab4<-read_csv("IPUMS_USA_ACS_2021_Lab_4.csv")
## Rows: 144101 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): sample, gq, sex, race, raced, hispan, hispand, educ, educd, wkswor...
## dbl (13): year, serial, cbserial, hhwt, cluster, strata, pernum, perwt, age,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# load the nv.R function package
source("/Volumes/middfiles/Classes/Fall23/ECON0211A/Noe_Labs/NV.R")
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
2. Append data
usa_earnings<-bind_rows(usa21_lab4, usa80_lab4)%>%
dplyr::select(-cpi, -cpi_2021_factor, -cpi_adjust_to_2021, -real_earn, -ln_earn, -college) # remove these variables
Tab sex year, sum(earning)
tab1<-usa_earnings%>%
group_by(year, sex)%>%
summarise(mean_earning=mean(earnings,na.rm=TRUE))%>%
pivot_wider(names_from = year,
values_from = mean_earning)%>%
gt()%>%
tab_header(
title = "Average earnings accross years and gender"
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
tab1
Average earnings accross years and gender |
sex |
1980 |
2021 |
female |
8478.621 |
54087.21 |
male |
18070.630 |
76026.61 |
#gtsave(tab1, "aver_earning.html") # save it in html, it can be opened in any format
# or
usa_sex<-usa_earnings%>%
group_by(year, sex)
nv_sum(usa_sex, earnings, weight = FALSE)%>%
gt()%>%
tab_header(
title = "Average earnings accross years and gender")%>%
gtsave( "aver_earning.html") # save it in html, it can be opened in any format
4. Merge usa_earnings with cpi
Earn_infra<-merge(usa_earnings, cpi, by="year", all.x = TRUE)
# generate real_earning
Earn_infra<-Earn_infra%>%
mutate(real_earn = earnings*cpi_adjust_to_2021)
Earn_infra%>%
group_by(year, sex)%>%
summarise(mean_earn =mean(real_earn, na.rm=TRUE))%>%
pivot_wider(names_from = year,
values_from = mean_earn)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 3
## sex `1980` `2021`
## <chr> <dbl> <dbl>
## 1 female 27888. 54087.
## 2 male 59437. 76027.
# or
earn_inf<-Earn_infra%>%
group_by(year, sex)
nv_sum(earn_inf, real_earn, weight = FALSE)
## # A tibble: 4 × 9
## year sex variable Obs min mean median st.dev max
## <dbl> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1980 female real_earn 76307 16.4 27888. 25672. 21195. 246687.
## 2 1980 male real_earn 72055 16.4 59437. 54288. 38438. 246687.
## 3 2021 female real_earn 72395 10 54087. 40000 56813. 787000
## 4 2021 male real_earn 71706 4 76027. 55000 83534. 787000
5. Generate log real_earn
Earn_infra<-Earn_infra%>%
mutate(ln_earn =log(real_earn))
Regressions (fit the models)
# Perform linear regression
m1 <- lm(ln_earn ~ female, data = Earn_infra, subset = (year == 1980))
summary(m1)
##
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (year ==
## 1980))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9460 -0.3446 0.1948 0.5714 2.5656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.746093 0.003881 2768.8 <2e-16 ***
## female -0.895846 0.005873 -152.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9579 on 108143 degrees of freedom
## (40217 observations deleted due to missingness)
## Multiple R-squared: 0.1771, Adjusted R-squared: 0.1771
## F-statistic: 2.327e+04 on 1 and 108143 DF, p-value: < 2.2e-16
m2 <- lm(ln_earn ~ female, data = Earn_infra, subset = (year == 2021))
summary(m2)
##
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (year ==
## 2021))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4077 -0.4362 0.1437 0.6464 3.1230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.793983 0.004619 2337.11 <2e-16 ***
## female -0.341007 0.006645 -51.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.094 on 108522 degrees of freedom
## (35577 observations deleted due to missingness)
## Multiple R-squared: 0.02369, Adjusted R-squared: 0.02368
## F-statistic: 2633 on 1 and 108522 DF, p-value: < 2.2e-16
#create a regression table
6. Over 40 hrs and weeks worked
Earn_infra%>%
group_by(year)%>%
summarise(mean_hr=mean(uhrswork2, na.rm=TRUE),
n=n())
## # A tibble: 2 × 3
## year mean_hr n
## <dbl> <dbl> <int>
## 1 1980 39.9 148362
## 2 2021 40.2 144101
Run regressions taking account weeks and hours worked
m3<-lm(ln_earn~female, data =Earn_infra, subset =(Over40Weeks ==1 & uhrswork2>=40 & year ==1980 ))
summary(m3)
##
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (Over40Weeks ==
## 1 & uhrswork2 >= 40 & year == 1980))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0973 -0.2825 0.0635 0.3582 2.0328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.897364 0.002887 3775.3 <2e-16 ***
## female -0.514335 0.005101 -100.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6476 on 74038 degrees of freedom
## (5279 observations deleted due to missingness)
## Multiple R-squared: 0.1207, Adjusted R-squared: 0.1207
## F-statistic: 1.017e+04 on 1 and 74038 DF, p-value: < 2.2e-16
m4<-lm(ln_earn~female, data =Earn_infra, subset =(Over40Weeks ==1 & uhrswork2>=40 & year ==2021 ))
summary(m4)
##
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (Over40Weeks ==
## 1 & uhrswork2 >= 40 & year == 2021))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6879 -0.4403 -0.0154 0.4387 2.7016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.074211 0.003472 3190.00 <2e-16 ***
## female -0.199827 0.005275 -37.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7337 on 78793 degrees of freedom
## (3455 observations deleted due to missingness)
## Multiple R-squared: 0.01788, Adjusted R-squared: 0.01787
## F-statistic: 1435 on 1 and 78793 DF, p-value: < 2.2e-16
7. Including education
# creating a college variable of at least having a college degree
Earn_infra<-Earn_infra%>%
mutate(college =ifelse(edyears>=16,1,0))
8. Estimating regression, including college
m5<-lm(ln_earn~female+college, data =Earn_infra, subset = (Over40Weeks ==1 & uhrswork2>=40 & year ==1980))
summary(m5)
##
## Call:
## lm(formula = ln_earn ~ female + college, data = Earn_infra, subset = (Over40Weeks ==
## 1 & uhrswork2 >= 40 & year == 1980))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3729 -0.2613 0.0676 0.3471 2.0968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.811188 0.003111 3474.74 <2e-16 ***
## female -0.492089 0.004976 -98.89 <2e-16 ***
## college 0.361812 0.005618 64.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6302 on 74037 degrees of freedom
## (5279 observations deleted due to missingness)
## Multiple R-squared: 0.1674, Adjusted R-squared: 0.1674
## F-statistic: 7442 on 2 and 74037 DF, p-value: < 2.2e-16
m6<-lm(ln_earn~female+college, data =Earn_infra, subset = (Over40Weeks ==1 & uhrswork2>=40 & year ==2021))
summary(m6)
##
## Call:
## lm(formula = ln_earn ~ female + college, data = Earn_infra, subset = (Over40Weeks ==
## 1 & uhrswork2 >= 40 & year == 2021))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0362 -0.3574 0.0028 0.3813 2.8064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.816933 0.003767 2871.21 <2e-16 ***
## female -0.263497 0.004839 -54.46 <2e-16 ***
## college 0.605604 0.004804 126.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6693 on 78792 degrees of freedom
## (3455 observations deleted due to missingness)
## Multiple R-squared: 0.1827, Adjusted R-squared: 0.1827
## F-statistic: 8809 on 2 and 78792 DF, p-value: < 2.2e-16
9 & 10. Combining all the regressions using stargazer
function
stargazer( m1, m2, m3, m4,m5, m6,
type = "text",
title = "Table 3. Log earnings on Female for years 1980 and 2021",
dep.var.caption = "DV: earings in year 1980 and 2021",
out = "table3.txt",
column.labels = c("80b","21b","80tb","21tb","80y","21y"),
notes = "Significance level"
)
##
## Table 3. Log earnings on Female for years 1980 and 2021
## =====================================================================================================================================================================================================
## DV: earings in year 1980 and 2021
## ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## ln_earn
## 80b 21b 80tb 21tb 80y 21y
## (1) (2) (3) (4) (5) (6)
## -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## female -0.896*** -0.341*** -0.514*** -0.200*** -0.492*** -0.263***
## (0.006) (0.007) (0.005) (0.005) (0.005) (0.005)
##
## college 0.362*** 0.606***
## (0.006) (0.005)
##
## Constant 10.746*** 10.794*** 10.897*** 11.074*** 10.811*** 10.817***
## (0.004) (0.005) (0.003) (0.003) (0.003) (0.004)
##
## -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Observations 108,145 108,524 74,040 78,795 74,040 78,795
## R2 0.177 0.024 0.121 0.018 0.167 0.183
## Adjusted R2 0.177 0.024 0.121 0.018 0.167 0.183
## Residual Std. Error 0.958 (df = 108143) 1.094 (df = 108522) 0.648 (df = 74038) 0.734 (df = 78793) 0.630 (df = 74037) 0.669 (df = 78792)
## F Statistic 23,269.960*** (df = 1; 108143) 2,633.494*** (df = 1; 108522) 10,166.630*** (df = 1; 74038) 1,434.864*** (df = 1; 78793) 7,441.704*** (df = 2; 74037) 8,809.481*** (df = 2; 78792)
## =====================================================================================================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## Significance level
estimating and testing correlations
cor(Earn_infra$female,Earn_infra$college, method = "pearson")
## [1] -0.006836197
cor.test(Earn_infra$female,Earn_infra$college, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: Earn_infra$female and Earn_infra$college
## t = -3.6971, df = 292461, p-value = 0.0002181
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01046014 -0.00321207
## sample estimates:
## cor
## -0.006836197
11. Creating race dummies
Earn_infra%>%
group_by(race)%>%
summarise(mean = mean(real_earn))
## # A tibble: 9 × 2
## race mean
## <chr> <dbl>
## 1 american indian or alaska native NA
## 2 black/african american NA
## 3 chinese NA
## 4 japanese NA
## 5 other asian or pacific islander NA
## 6 other race, nec NA
## 7 three or more major races NA
## 8 two major races NA
## 9 white NA
Earn_infra<-Earn_infra%>%
mutate(white_nh=ifelse(race=="white" & hispan=="not hispanic", 1,0),
black_nh=ifelse(race=="black/african american" & hispan =="not hispanic",1, 0),
hispan_nh=ifelse(hispan!="not hispanic", 1, 0),
other_nh=ifelse(race!="white" & race !="black/african american" & hispan =="not hispanic",1,0))
nv_sum(Earn_infra, white_nh, black_nh, hispan_nh, other_nh, weight = FALSE)
## variable Obs min mean median st.dev max
## 1 white_nh 292463 0 0.71664792 1 0.4506266 1
## 2 black_nh 292463 0 0.09883643 0 0.2984428 1
## 3 hispan_nh 292463 0 0.10959677 0 0.3123870 1
## 4 other_nh 292463 0 0.07491888 0 0.2632609 1