Important packages (libraries) for this lab:

  1. tidyverse
  2. haven
  3. stargazer
  4. gt: for saving summary tables # use the data from stata
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(haven) #to help merge
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
if(!require("gt")) install.packages("gt")
## Loading required package: gt
library(gt)

rm(list = ls())

Load data

usa80_lab4<-read_csv("Ipums_USA_census_1980.csv")
## Rows: 148362 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): sample, gq, sex, race, raced, hispan, hispand, educ, educd, wkswor...
## dbl (18): year, serial, hhwt, cluster, strata, pernum, perwt, age, incwage, ...
## lgl  (1): cbserial
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cpi<- read_csv("cpi_inflation_adju.csv")
## Rows: 76 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): year, cpi, cpi_2021_factor, cpi_adjust_to_2021
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa21_lab4<-read_csv("IPUMS_USA_ACS_2021_Lab_4.csv")
## Rows: 144101 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): sample, gq, sex, race, raced, hispan, hispand, educ, educd, wkswor...
## dbl (13): year, serial, cbserial, hhwt, cluster, strata, pernum, perwt, age,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# load the nv.R function package
source("/Volumes/middfiles/Classes/Fall23/ECON0211A/Noe_Labs/NV.R") 
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice

2. Append data

usa_earnings<-bind_rows(usa21_lab4, usa80_lab4)%>%
  dplyr::select(-cpi, -cpi_2021_factor, -cpi_adjust_to_2021, -real_earn, -ln_earn, -college) # remove these variables

Tab sex year, sum(earning)

tab1<-usa_earnings%>%
  group_by(year, sex)%>%
  summarise(mean_earning=mean(earnings,na.rm=TRUE))%>%
  pivot_wider(names_from = year,
              values_from = mean_earning)%>%
  gt()%>%
  tab_header(
    title = "Average earnings accross years and gender"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
tab1
Average earnings accross years and gender
sex 1980 2021
female 8478.621 54087.21
male 18070.630 76026.61
#gtsave(tab1, "aver_earning.html") # save it in html, it can be opened in any format

# or
usa_sex<-usa_earnings%>%
  group_by(year, sex)
nv_sum(usa_sex, earnings, weight = FALSE)%>%
  gt()%>%
  tab_header(
    title = "Average earnings accross years and gender")%>%
  gtsave( "aver_earning.html") # save it in html, it can be opened in any format

4. Merge usa_earnings with cpi

Earn_infra<-merge(usa_earnings, cpi, by="year", all.x = TRUE)

# generate real_earning
Earn_infra<-Earn_infra%>%
  mutate(real_earn = earnings*cpi_adjust_to_2021)

Earn_infra%>%
  group_by(year, sex)%>%
  summarise(mean_earn =mean(real_earn, na.rm=TRUE))%>%
  pivot_wider(names_from = year,
              values_from = mean_earn)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 3
##   sex    `1980` `2021`
##   <chr>   <dbl>  <dbl>
## 1 female 27888. 54087.
## 2 male   59437. 76027.
# or
earn_inf<-Earn_infra%>%
  group_by(year, sex)
nv_sum(earn_inf, real_earn, weight = FALSE)
## # A tibble: 4 × 9
##    year sex    variable    Obs   min   mean median st.dev     max
##   <dbl> <chr>  <chr>     <int> <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
## 1  1980 female real_earn 76307  16.4 27888. 25672. 21195. 246687.
## 2  1980 male   real_earn 72055  16.4 59437. 54288. 38438. 246687.
## 3  2021 female real_earn 72395  10   54087. 40000  56813. 787000 
## 4  2021 male   real_earn 71706   4   76027. 55000  83534. 787000

5. Generate log real_earn

Earn_infra<-Earn_infra%>%
  mutate(ln_earn =log(real_earn))

Regressions (fit the models)

# Perform linear regression
m1 <- lm(ln_earn ~ female, data = Earn_infra, subset = (year == 1980))
summary(m1)
## 
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (year == 
##     1980))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9460 -0.3446  0.1948  0.5714  2.5656 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.746093   0.003881  2768.8   <2e-16 ***
## female      -0.895846   0.005873  -152.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9579 on 108143 degrees of freedom
##   (40217 observations deleted due to missingness)
## Multiple R-squared:  0.1771, Adjusted R-squared:  0.1771 
## F-statistic: 2.327e+04 on 1 and 108143 DF,  p-value: < 2.2e-16
m2 <- lm(ln_earn ~ female, data = Earn_infra, subset = (year == 2021))
summary(m2)
## 
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (year == 
##     2021))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4077 -0.4362  0.1437  0.6464  3.1230 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.793983   0.004619 2337.11   <2e-16 ***
## female      -0.341007   0.006645  -51.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.094 on 108522 degrees of freedom
##   (35577 observations deleted due to missingness)
## Multiple R-squared:  0.02369,    Adjusted R-squared:  0.02368 
## F-statistic:  2633 on 1 and 108522 DF,  p-value: < 2.2e-16
#create a regression table

6. Over 40 hrs and weeks worked

Earn_infra%>%
  group_by(year)%>%
  summarise(mean_hr=mean(uhrswork2, na.rm=TRUE),
            n=n())
## # A tibble: 2 × 3
##    year mean_hr      n
##   <dbl>   <dbl>  <int>
## 1  1980    39.9 148362
## 2  2021    40.2 144101

Run regressions taking account weeks and hours worked

m3<-lm(ln_earn~female, data =Earn_infra, subset =(Over40Weeks ==1 & uhrswork2>=40 & year ==1980 ))
summary(m3)
## 
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (Over40Weeks == 
##     1 & uhrswork2 >= 40 & year == 1980))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0973 -0.2825  0.0635  0.3582  2.0328 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.897364   0.002887  3775.3   <2e-16 ***
## female      -0.514335   0.005101  -100.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6476 on 74038 degrees of freedom
##   (5279 observations deleted due to missingness)
## Multiple R-squared:  0.1207, Adjusted R-squared:  0.1207 
## F-statistic: 1.017e+04 on 1 and 74038 DF,  p-value: < 2.2e-16
m4<-lm(ln_earn~female, data =Earn_infra, subset =(Over40Weeks ==1 & uhrswork2>=40 & year ==2021 ))
summary(m4)
## 
## Call:
## lm(formula = ln_earn ~ female, data = Earn_infra, subset = (Over40Weeks == 
##     1 & uhrswork2 >= 40 & year == 2021))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6879 -0.4403 -0.0154  0.4387  2.7016 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.074211   0.003472 3190.00   <2e-16 ***
## female      -0.199827   0.005275  -37.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7337 on 78793 degrees of freedom
##   (3455 observations deleted due to missingness)
## Multiple R-squared:  0.01788,    Adjusted R-squared:  0.01787 
## F-statistic:  1435 on 1 and 78793 DF,  p-value: < 2.2e-16

7. Including education

# creating a college variable of at least having a college degree
Earn_infra<-Earn_infra%>%
  mutate(college =ifelse(edyears>=16,1,0))

8. Estimating regression, including college

m5<-lm(ln_earn~female+college, data =Earn_infra, subset = (Over40Weeks ==1 & uhrswork2>=40 & year ==1980))
summary(m5)
## 
## Call:
## lm(formula = ln_earn ~ female + college, data = Earn_infra, subset = (Over40Weeks == 
##     1 & uhrswork2 >= 40 & year == 1980))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3729 -0.2613  0.0676  0.3471  2.0968 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.811188   0.003111 3474.74   <2e-16 ***
## female      -0.492089   0.004976  -98.89   <2e-16 ***
## college      0.361812   0.005618   64.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6302 on 74037 degrees of freedom
##   (5279 observations deleted due to missingness)
## Multiple R-squared:  0.1674, Adjusted R-squared:  0.1674 
## F-statistic:  7442 on 2 and 74037 DF,  p-value: < 2.2e-16
m6<-lm(ln_earn~female+college, data =Earn_infra, subset = (Over40Weeks ==1 & uhrswork2>=40 & year ==2021))
summary(m6)
## 
## Call:
## lm(formula = ln_earn ~ female + college, data = Earn_infra, subset = (Over40Weeks == 
##     1 & uhrswork2 >= 40 & year == 2021))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0362  -0.3574   0.0028   0.3813   2.8064 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.816933   0.003767 2871.21   <2e-16 ***
## female      -0.263497   0.004839  -54.46   <2e-16 ***
## college      0.605604   0.004804  126.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6693 on 78792 degrees of freedom
##   (3455 observations deleted due to missingness)
## Multiple R-squared:  0.1827, Adjusted R-squared:  0.1827 
## F-statistic:  8809 on 2 and 78792 DF,  p-value: < 2.2e-16

9 & 10. Combining all the regressions using stargazer function

stargazer( m1, m2, m3, m4,m5, m6,
          type = "text",
          title = "Table 3. Log earnings on Female for years 1980 and 2021",
          dep.var.caption = "DV: earings in year 1980 and 2021",
          out = "table3.txt",
          column.labels = c("80b","21b","80tb","21tb","80y","21y"),
          notes = "Significance level"
          )
## 
## Table 3. Log earnings on Female for years 1980 and 2021
## =====================================================================================================================================================================================================
##                                                                                             DV: earings in year 1980 and 2021                                                                        
##                     ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
##                                                                                                          ln_earn                                                                                     
##                                  80b                            21b                          80tb                          21tb                         80y                          21y             
##                                  (1)                            (2)                           (3)                          (4)                          (5)                          (6)             
## -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## female                        -0.896***                      -0.341***                     -0.514***                    -0.200***                    -0.492***                    -0.263***          
##                                (0.006)                        (0.007)                       (0.005)                      (0.005)                      (0.005)                      (0.005)           
##                                                                                                                                                                                                      
## college                                                                                                                                               0.362***                     0.606***          
##                                                                                                                                                       (0.006)                      (0.005)           
##                                                                                                                                                                                                      
## Constant                      10.746***                      10.794***                     10.897***                    11.074***                    10.811***                    10.817***          
##                                (0.004)                        (0.005)                       (0.003)                      (0.003)                      (0.003)                      (0.004)           
##                                                                                                                                                                                                      
## -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Observations                   108,145                        108,524                       74,040                        78,795                       74,040                       78,795           
## R2                              0.177                          0.024                         0.121                        0.018                        0.167                        0.183            
## Adjusted R2                     0.177                          0.024                         0.121                        0.018                        0.167                        0.183            
## Residual Std. Error      0.958 (df = 108143)            1.094 (df = 108522)           0.648 (df = 74038)            0.734 (df = 78793)           0.630 (df = 74037)           0.669 (df = 78792)     
## F Statistic         23,269.960*** (df = 1; 108143) 2,633.494*** (df = 1; 108522) 10,166.630*** (df = 1; 74038) 1,434.864*** (df = 1; 78793) 7,441.704*** (df = 2; 74037) 8,809.481*** (df = 2; 78792)
## =====================================================================================================================================================================================================
## Note:                                                                                                                                                                     *p<0.1; **p<0.05; ***p<0.01
##                                                                                                                                                                                    Significance level

estimating and testing correlations

cor(Earn_infra$female,Earn_infra$college, method = "pearson")
## [1] -0.006836197
cor.test(Earn_infra$female,Earn_infra$college, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  Earn_infra$female and Earn_infra$college
## t = -3.6971, df = 292461, p-value = 0.0002181
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01046014 -0.00321207
## sample estimates:
##          cor 
## -0.006836197

11. Creating race dummies

Earn_infra%>%
  group_by(race)%>%
  summarise(mean = mean(real_earn))
## # A tibble: 9 × 2
##   race                              mean
##   <chr>                            <dbl>
## 1 american indian or alaska native    NA
## 2 black/african american              NA
## 3 chinese                             NA
## 4 japanese                            NA
## 5 other asian or pacific islander     NA
## 6 other race, nec                     NA
## 7 three or more major races           NA
## 8 two major races                     NA
## 9 white                               NA
Earn_infra<-Earn_infra%>%
  mutate(white_nh=ifelse(race=="white" & hispan=="not hispanic", 1,0),
         black_nh=ifelse(race=="black/african american" & hispan =="not hispanic",1, 0),
         hispan_nh=ifelse(hispan!="not hispanic", 1, 0),
         other_nh=ifelse(race!="white" & race !="black/african american" & hispan =="not hispanic",1,0))

nv_sum(Earn_infra, white_nh, black_nh, hispan_nh, other_nh, weight = FALSE)  
##    variable    Obs min       mean median    st.dev max
## 1  white_nh 292463   0 0.71664792      1 0.4506266   1
## 2  black_nh 292463   0 0.09883643      0 0.2984428   1
## 3 hispan_nh 292463   0 0.10959677      0 0.3123870   1
## 4  other_nh 292463   0 0.07491888      0 0.2632609   1