#Note: Install all the necessary packages before loading the libraries if they are not installed already ## Important packages (libraries) for this lab:

  1. tidyverse
  2. cowplot
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(cowplot) # to put graphs together

rm(list = ls()) #clear the environment

1. loading data

nv_sum(usa21_inc, incwage, edyears, earnings, weight = FALSE)
## # A tibble: 3 × 7
##   variable    Obs   min    mean median   st.dev    max
##   <chr>     <int> <dbl>   <dbl>  <dbl>    <dbl>  <dbl>
## 1 incwage  144060     0 49306.   34000 69753.   770000
## 2 edyears  144060     0    13.8     13     3.24     20
## 3 earnings 144060     4 65409.   48000 73493.   770000

2. Estimating regression of earning on years

Model1<-lm(earnings~edyears, data = usa21_inc, na.action = na.exclude) # omit N/A's to get predicted values  
summary(Model1)
## 
## Call:
## lm(formula = earnings ~ edyears, data = usa21_inc, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107252  -34696  -14342   13417  713417 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -37553.63    1010.37  -37.17   <2e-16 ***
## edyears       7241.27      69.47  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70070 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16
coef(Model1) # pull out the coefficients
## (Intercept)     edyears 
##  -37553.629    7241.268

3. generating residuals and fitted values

usa21_inc$yhat=fitted(Model1) # generate fitted values from Model1
#this also works: #usa21_inc$yhat<-Model1$fitted.values 

usa21_inc$uhat<- residuals(Model1) # generate residual values.

cor(usa21_inc$edyears,usa21_inc$earnings) # correlation between two variables
## [1] NA
#view the predicted variables
usa21_inc%>%
  na.omit()%>%
  dplyr::select(yhat, uhat) 
## # A tibble: 108,593 × 2
##      yhat    uhat
##     <dbl>   <dbl>
##  1 34859.  -8859.
##  2 56583. -56313.
##  3 49342. -37342.
##  4 13135.   2865.
##  5 56583. -54983.
##  6 56583. -22583.
##  7 49342. -19342.
##  8 56583. -30583.
##  9 34859. -19459.
## 10 49342. -19342.
## # ℹ 108,583 more rows

creating graphs

g1<-ggplot(usa21_inc,aes(x=edyears, y =earnings))+ 
  geom_point()+
  labs(
    title = "Income wage on years of education",
    x="years of education",
    y = " income wage"
  )

# another quick way of plotting is using qplot function: qplot(usa21_inc$edyears,usa21_inc$INCWAGE) +geom_smooth(method = lm, se=FALSE)
g2<-ggplot(usa21_inc,aes(x=edyears, y =earnings))+ # x and y have to be within the ggplot function for the fitted line to work
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  labs(
    title = "Income wage on years of education",
    x="years of education",
    y = " income wage"
  )
 
# plot the residual on fitted values
# the line of best fit is horizontal: there is is not a pattern to this data. So there is a linear relationship.
g3<-ggplot(usa21_inc, aes(x=yhat, y = uhat))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  labs(
    title = "Residual on fitted values",
    x="fitted_values(yhat)",
    y="residuals(uhat)"
  )
plot_grid(g1, g2, g3)
## Warning: Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 35466 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 35466 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).

4. Estimating a regression model of earnings on months of schooling

# creating a new variable month
usa21_inc<-usa21_inc%>%
  mutate(edmonths = edyears*10) # assuming each year has 10 months of schooling

# regress earning on months of education
Model2<-lm(earnings~edmonths, data = usa21_inc, na.action = na.exclude)
summary(Model2)
## 
## Call:
## lm(formula = earnings ~ edmonths, data = usa21_inc, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107252  -34696  -14342   13417  713417 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -37553.629   1010.373  -37.17   <2e-16 ***
## edmonths       724.127      6.947  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70070 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

5. Estimating a regression model of earnings(in cents) on years of education

#create a new variable earn_cents
usa21_inc<-usa21_inc%>%
  mutate(earn_cents=earnings*100)

Model3<-lm(earn_cents~edyears, data=usa21_inc)
summary(Model3)
## 
## Call:
## lm(formula = earn_cents ~ edyears, data = usa21_inc)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -10725172  -3469595  -1434158   1341715  71341715 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3755363     101037  -37.17   <2e-16 ***
## edyears       724127       6947  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7007000 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

6. multiplying by 5

usa21_inc<-usa21_inc%>%
  mutate(earn_5 = earnings*5,
         ed_5 =edyears*5)
Model4<-lm(earn_5~ed_5, data = usa21_inc)
summary(Model4)
## 
## Call:
## lm(formula = earn_5 ~ ed_5, data = usa21_inc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -536259 -173480  -71708   67086 3567086 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -187768.15    5051.86  -37.17   <2e-16 ***
## ed_5           7241.27      69.47  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 350400 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

7. data transformation using log

usa21_inc<-usa21_inc%>%
  mutate(ln_earnings=log(earnings))

8. Estimating a fitted model of log of earnings on years of education.

Model5<-lm(ln_earnings~edyears, data = usa21_inc)
summary(Model5)
## 
## Call:
## lm(formula = ln_earnings ~ edyears, data = usa21_inc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4380 -0.3612  0.1552  0.6057  4.3171 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.042662   0.015213   594.4   <2e-16 ***
## edyears     0.111352   0.001046   106.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.055 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.0945, Adjusted R-squared:  0.09449 
## F-statistic: 1.133e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

9. Dummy variable as explainatory variable

# creating a dummy variable for being female and summarize earnings:
usa21_inc%>%
  count(sex)
## # A tibble: 2 × 2
##   sex        n
##   <chr>  <int>
## 1 female 72045
## 2 male   72015
usa21_inc<-usa21_inc%>%
  mutate(female =ifelse(sex=="female", 1,0))
usa21_inc%>%
  summarise(mean_female=mean(female))
## # A tibble: 1 × 1
##   mean_female
##         <dbl>
## 1       0.500
# summarizing using nv_sum()
female_usa21_inc<-usa21_inc%>%
 group_by(female)
nv_sum(female_usa21_inc, earnings, weight = FALSE)
## # A tibble: 2 × 8
##   female variable   Obs   min   mean median st.dev    max
##    <dbl> <chr>    <int> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1      0 earnings 72015     4 76289.  55000 84401. 770000
## 2      1 earnings 72045     4 53621.  40000 57166. 682000
"
#This is the regular summary without using nv_sum function
usa21_inc%>%
  group_by(female)%>%
  summarise(mean_wage =mean(earnings),
            total=n())
"
## [1] "\n#This is the regular summary without using nv_sum function\nusa21_inc%>%\n  group_by(female)%>%\n  summarise(mean_wage =mean(earnings),\n            total=n())\n"
# Regress earnings on the female
Model6<-lm(earnings~female, data = usa21_inc)
summary(Model6)
## 
## Call:
## lm(formula = earnings ~ female, data = usa21_inc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -76285 -37621 -16289  13711 693711 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  76289.0      305.6  249.66   <2e-16 ***
## female      -22667.8      441.1  -51.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72620 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.02375,    Adjusted R-squared:  0.02374 
## F-statistic:  2641 on 1 and 108592 DF,  p-value: < 2.2e-16
# creating a dummy variable male
usa21_inc<-usa21_inc%>%
  mutate(male =ifelse(sex =="male",1, 0))

#summarize earnings
male_usa21_inc<-usa21_inc%>%
  group_by(male)
nv_sum(male_usa21_inc, earnings, weight = FALSE)
## # A tibble: 2 × 8
##    male variable   Obs   min   mean median st.dev    max
##   <dbl> <chr>    <int> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     0 earnings 72045     4 53621.  40000 57166. 682000
## 2     1 earnings 72015     4 76289.  55000 84401. 770000
# Regress earnings on the male
Model7<-lm(earnings~male, data = usa21_inc)
summary(Model7)
## 
## Call:
## lm(formula = earnings ~ male, data = usa21_inc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -76285 -37621 -16289  13711 693711 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  53621.3      318.1  168.59   <2e-16 ***
## male         22667.8      441.1   51.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72620 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.02375,    Adjusted R-squared:  0.02374 
## F-statistic:  2641 on 1 and 108592 DF,  p-value: < 2.2e-16
# regress ln_earnings on female
Model8<-lm(ln_earnings~female, data = usa21_inc)
summary(Model8)
## 
## Call:
## lm(formula = ln_earnings ~ female, data = usa21_inc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4141 -0.4381  0.1597  0.6452  2.9959 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.800438   0.004603 2346.54   <2e-16 ***
## female      -0.363524   0.006644  -54.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.094 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.02683,    Adjusted R-squared:  0.02682 
## F-statistic:  2994 on 1 and 108592 DF,  p-value: < 2.2e-16