#Note: Install all the necessary packages before loading the libraries if they are not installed already ## Important packages (libraries) for this lab:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(cowplot) # to put graphs together
rm(list = ls()) #clear the environment
nv_sum(usa21_inc, incwage, edyears, earnings, weight = FALSE)
## # A tibble: 3 × 7
## variable Obs min mean median st.dev max
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 incwage 144060 0 49306. 34000 69753. 770000
## 2 edyears 144060 0 13.8 13 3.24 20
## 3 earnings 144060 4 65409. 48000 73493. 770000
Model1<-lm(earnings~edyears, data = usa21_inc, na.action = na.exclude) # omit N/A's to get predicted values
summary(Model1)
##
## Call:
## lm(formula = earnings ~ edyears, data = usa21_inc, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107252 -34696 -14342 13417 713417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -37553.63 1010.37 -37.17 <2e-16 ***
## edyears 7241.27 69.47 104.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70070 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.09096, Adjusted R-squared: 0.09095
## F-statistic: 1.087e+04 on 1 and 108592 DF, p-value: < 2.2e-16
coef(Model1) # pull out the coefficients
## (Intercept) edyears
## -37553.629 7241.268
usa21_inc$yhat=fitted(Model1) # generate fitted values from Model1
#this also works: #usa21_inc$yhat<-Model1$fitted.values
usa21_inc$uhat<- residuals(Model1) # generate residual values.
cor(usa21_inc$edyears,usa21_inc$earnings) # correlation between two variables
## [1] NA
#view the predicted variables
usa21_inc%>%
na.omit()%>%
dplyr::select(yhat, uhat)
## # A tibble: 108,593 × 2
## yhat uhat
## <dbl> <dbl>
## 1 34859. -8859.
## 2 56583. -56313.
## 3 49342. -37342.
## 4 13135. 2865.
## 5 56583. -54983.
## 6 56583. -22583.
## 7 49342. -19342.
## 8 56583. -30583.
## 9 34859. -19459.
## 10 49342. -19342.
## # ℹ 108,583 more rows
g1<-ggplot(usa21_inc,aes(x=edyears, y =earnings))+
geom_point()+
labs(
title = "Income wage on years of education",
x="years of education",
y = " income wage"
)
# another quick way of plotting is using qplot function: qplot(usa21_inc$edyears,usa21_inc$INCWAGE) +geom_smooth(method = lm, se=FALSE)
g2<-ggplot(usa21_inc,aes(x=edyears, y =earnings))+ # x and y have to be within the ggplot function for the fitted line to work
geom_point()+
geom_smooth(method = "lm", se=FALSE)+
labs(
title = "Income wage on years of education",
x="years of education",
y = " income wage"
)
# plot the residual on fitted values
# the line of best fit is horizontal: there is is not a pattern to this data. So there is a linear relationship.
g3<-ggplot(usa21_inc, aes(x=yhat, y = uhat))+
geom_point()+
geom_smooth(method = "lm", se=FALSE)+
labs(
title = "Residual on fitted values",
x="fitted_values(yhat)",
y="residuals(uhat)"
)
plot_grid(g1, g2, g3)
## Warning: Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 35466 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 35466 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).
# creating a new variable month
usa21_inc<-usa21_inc%>%
mutate(edmonths = edyears*10) # assuming each year has 10 months of schooling
# regress earning on months of education
Model2<-lm(earnings~edmonths, data = usa21_inc, na.action = na.exclude)
summary(Model2)
##
## Call:
## lm(formula = earnings ~ edmonths, data = usa21_inc, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107252 -34696 -14342 13417 713417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -37553.629 1010.373 -37.17 <2e-16 ***
## edmonths 724.127 6.947 104.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70070 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.09096, Adjusted R-squared: 0.09095
## F-statistic: 1.087e+04 on 1 and 108592 DF, p-value: < 2.2e-16
#create a new variable earn_cents
usa21_inc<-usa21_inc%>%
mutate(earn_cents=earnings*100)
Model3<-lm(earn_cents~edyears, data=usa21_inc)
summary(Model3)
##
## Call:
## lm(formula = earn_cents ~ edyears, data = usa21_inc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10725172 -3469595 -1434158 1341715 71341715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3755363 101037 -37.17 <2e-16 ***
## edyears 724127 6947 104.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7007000 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.09096, Adjusted R-squared: 0.09095
## F-statistic: 1.087e+04 on 1 and 108592 DF, p-value: < 2.2e-16
usa21_inc<-usa21_inc%>%
mutate(earn_5 = earnings*5,
ed_5 =edyears*5)
Model4<-lm(earn_5~ed_5, data = usa21_inc)
summary(Model4)
##
## Call:
## lm(formula = earn_5 ~ ed_5, data = usa21_inc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -536259 -173480 -71708 67086 3567086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -187768.15 5051.86 -37.17 <2e-16 ***
## ed_5 7241.27 69.47 104.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 350400 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.09096, Adjusted R-squared: 0.09095
## F-statistic: 1.087e+04 on 1 and 108592 DF, p-value: < 2.2e-16
usa21_inc<-usa21_inc%>%
mutate(ln_earnings=log(earnings))
Model5<-lm(ln_earnings~edyears, data = usa21_inc)
summary(Model5)
##
## Call:
## lm(formula = ln_earnings ~ edyears, data = usa21_inc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4380 -0.3612 0.1552 0.6057 4.3171
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.042662 0.015213 594.4 <2e-16 ***
## edyears 0.111352 0.001046 106.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.0945, Adjusted R-squared: 0.09449
## F-statistic: 1.133e+04 on 1 and 108592 DF, p-value: < 2.2e-16
# creating a dummy variable for being female and summarize earnings:
usa21_inc%>%
count(sex)
## # A tibble: 2 × 2
## sex n
## <chr> <int>
## 1 female 72045
## 2 male 72015
usa21_inc<-usa21_inc%>%
mutate(female =ifelse(sex=="female", 1,0))
usa21_inc%>%
summarise(mean_female=mean(female))
## # A tibble: 1 × 1
## mean_female
## <dbl>
## 1 0.500
# summarizing using nv_sum()
female_usa21_inc<-usa21_inc%>%
group_by(female)
nv_sum(female_usa21_inc, earnings, weight = FALSE)
## # A tibble: 2 × 8
## female variable Obs min mean median st.dev max
## <dbl> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 earnings 72015 4 76289. 55000 84401. 770000
## 2 1 earnings 72045 4 53621. 40000 57166. 682000
"
#This is the regular summary without using nv_sum function
usa21_inc%>%
group_by(female)%>%
summarise(mean_wage =mean(earnings),
total=n())
"
## [1] "\n#This is the regular summary without using nv_sum function\nusa21_inc%>%\n group_by(female)%>%\n summarise(mean_wage =mean(earnings),\n total=n())\n"
# Regress earnings on the female
Model6<-lm(earnings~female, data = usa21_inc)
summary(Model6)
##
## Call:
## lm(formula = earnings ~ female, data = usa21_inc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76285 -37621 -16289 13711 693711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76289.0 305.6 249.66 <2e-16 ***
## female -22667.8 441.1 -51.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72620 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.02375, Adjusted R-squared: 0.02374
## F-statistic: 2641 on 1 and 108592 DF, p-value: < 2.2e-16
# creating a dummy variable male
usa21_inc<-usa21_inc%>%
mutate(male =ifelse(sex =="male",1, 0))
#summarize earnings
male_usa21_inc<-usa21_inc%>%
group_by(male)
nv_sum(male_usa21_inc, earnings, weight = FALSE)
## # A tibble: 2 × 8
## male variable Obs min mean median st.dev max
## <dbl> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 earnings 72045 4 53621. 40000 57166. 682000
## 2 1 earnings 72015 4 76289. 55000 84401. 770000
# Regress earnings on the male
Model7<-lm(earnings~male, data = usa21_inc)
summary(Model7)
##
## Call:
## lm(formula = earnings ~ male, data = usa21_inc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76285 -37621 -16289 13711 693711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53621.3 318.1 168.59 <2e-16 ***
## male 22667.8 441.1 51.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72620 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.02375, Adjusted R-squared: 0.02374
## F-statistic: 2641 on 1 and 108592 DF, p-value: < 2.2e-16
# regress ln_earnings on female
Model8<-lm(ln_earnings~female, data = usa21_inc)
summary(Model8)
##
## Call:
## lm(formula = ln_earnings ~ female, data = usa21_inc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4141 -0.4381 0.1597 0.6452 2.9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.800438 0.004603 2346.54 <2e-16 ***
## female -0.363524 0.006644 -54.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.094 on 108592 degrees of freedom
## (35466 observations deleted due to missingness)
## Multiple R-squared: 0.02683, Adjusted R-squared: 0.02682
## F-statistic: 2994 on 1 and 108592 DF, p-value: < 2.2e-16