1. loading data
2. Estimating regression of earning on years
3. generating residuals and fitted values
creating graphs
4. Estimating a regression model of earnings on months of schooling
5. Estimating a regression model of earnings(in cents) on years of education
6. multiplying by 5
7. data transformation using log
8. Estimating a fitted model of log of earnings on years of education.
9. Dummy variable as explainatory variable

#Note: Install all the necessary packages before loading the libraries if they are not installed already ## Important packages (libraries) for this lab:

tidyverse
cowplot

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(cowplot) # to put graphs together

rm(list = ls()) #clear the environment

1. loading data

nv_sum(usa21_inc, incwage, edyears, earnings, weight = FALSE)

## # A tibble: 3 × 7
##   variable    Obs   min    mean median   st.dev    max
##   <chr>     <int> <dbl>   <dbl>  <dbl>    <dbl>  <dbl>
## 1 incwage  144060     0 49306.   34000 69753.   770000
## 2 edyears  144060     0    13.8     13     3.24     20
## 3 earnings 144060     4 65409.   48000 73493.   770000

2. Estimating regression of earning on years

Model1<-lm(earnings~edyears, data = usa21_inc, na.action = na.exclude) # omit N/A's to get predicted values  
summary(Model1)

## 
## Call:
## lm(formula = earnings ~ edyears, data = usa21_inc, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107252  -34696  -14342   13417  713417 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -37553.63    1010.37  -37.17   <2e-16 ***
## edyears       7241.27      69.47  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70070 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

coef(Model1) # pull out the coefficients

## (Intercept)     edyears 
##  -37553.629    7241.268

3. generating residuals and fitted values

usa21_inc$yhat=fitted(Model1) # generate fitted values from Model1
#this also works: #usa21_inc$yhat<-Model1$fitted.values 

usa21_inc$uhat<- residuals(Model1) # generate residual values.

cor(usa21_inc$edyears,usa21_inc$earnings) # correlation between two variables

## [1] NA

#view the predicted variables
usa21_inc%>%
  na.omit()%>%
  dplyr::select(yhat, uhat)

## # A tibble: 108,593 × 2
##      yhat    uhat
##     <dbl>   <dbl>
##  1 34859.  -8859.
##  2 56583. -56313.
##  3 49342. -37342.
##  4 13135.   2865.
##  5 56583. -54983.
##  6 56583. -22583.
##  7 49342. -19342.
##  8 56583. -30583.
##  9 34859. -19459.
## 10 49342. -19342.
## # ℹ 108,583 more rows

creating graphs

g1<-ggplot(usa21_inc,aes(x=edyears, y =earnings))+ 
  geom_point()+
  labs(
    title = "Income wage on years of education",
    x="years of education",
    y = " income wage"
  )

# another quick way of plotting is using qplot function: qplot(usa21_inc$edyears,usa21_inc$INCWAGE) +geom_smooth(method = lm, se=FALSE)
g2<-ggplot(usa21_inc,aes(x=edyears, y =earnings))+ # x and y have to be within the ggplot function for the fitted line to work
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  labs(
    title = "Income wage on years of education",
    x="years of education",
    y = " income wage"
  )
 
# plot the residual on fitted values
# the line of best fit is horizontal: there is is not a pattern to this data. So there is a linear relationship.
g3<-ggplot(usa21_inc, aes(x=yhat, y = uhat))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  labs(
    title = "Residual on fitted values",
    x="fitted_values(yhat)",
    y="residuals(uhat)"
  )
plot_grid(g1, g2, g3)

## Warning: Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 35466 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 35466 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 35466 rows containing missing values or values outside the scale range
## (`geom_point()`).

4. Estimating a regression model of earnings on months of schooling

# creating a new variable month
usa21_inc<-usa21_inc%>%
  mutate(edmonths = edyears*10) # assuming each year has 10 months of schooling

# regress earning on months of education
Model2<-lm(earnings~edmonths, data = usa21_inc, na.action = na.exclude)
summary(Model2)

## 
## Call:
## lm(formula = earnings ~ edmonths, data = usa21_inc, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107252  -34696  -14342   13417  713417 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -37553.629   1010.373  -37.17   <2e-16 ***
## edmonths       724.127      6.947  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70070 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

5. Estimating a regression model of earnings(in cents) on years of education

#create a new variable earn_cents
usa21_inc<-usa21_inc%>%
  mutate(earn_cents=earnings*100)

Model3<-lm(earn_cents~edyears, data=usa21_inc)
summary(Model3)

## 
## Call:
## lm(formula = earn_cents ~ edyears, data = usa21_inc)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -10725172  -3469595  -1434158   1341715  71341715 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3755363     101037  -37.17   <2e-16 ***
## edyears       724127       6947  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7007000 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

6. multiplying by 5

usa21_inc<-usa21_inc%>%
  mutate(earn_5 = earnings*5,
         ed_5 =edyears*5)
Model4<-lm(earn_5~ed_5, data = usa21_inc)
summary(Model4)

## 
## Call:
## lm(formula = earn_5 ~ ed_5, data = usa21_inc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -536259 -173480  -71708   67086 3567086 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -187768.15    5051.86  -37.17   <2e-16 ***
## ed_5           7241.27      69.47  104.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 350400 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.09096,    Adjusted R-squared:  0.09095 
## F-statistic: 1.087e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

7. data transformation using log

usa21_inc<-usa21_inc%>%
  mutate(ln_earnings=log(earnings))

8. Estimating a fitted model of log of earnings on years of education.

Model5<-lm(ln_earnings~edyears, data = usa21_inc)
summary(Model5)

## 
## Call:
## lm(formula = ln_earnings ~ edyears, data = usa21_inc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4380 -0.3612  0.1552  0.6057  4.3171 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.042662   0.015213   594.4   <2e-16 ***
## edyears     0.111352   0.001046   106.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.055 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.0945, Adjusted R-squared:  0.09449 
## F-statistic: 1.133e+04 on 1 and 108592 DF,  p-value: < 2.2e-16

9. Dummy variable as explainatory variable

# creating a dummy variable for being female and summarize earnings:
usa21_inc%>%
  count(sex)

## # A tibble: 2 × 2
##   sex        n
##   <chr>  <int>
## 1 female 72045
## 2 male   72015

usa21_inc<-usa21_inc%>%
  mutate(female =ifelse(sex=="female", 1,0))
usa21_inc%>%
  summarise(mean_female=mean(female))

## # A tibble: 1 × 1
##   mean_female
##         <dbl>
## 1       0.500

# summarizing using nv_sum()
female_usa21_inc<-usa21_inc%>%
 group_by(female)
nv_sum(female_usa21_inc, earnings, weight = FALSE)

## # A tibble: 2 × 8
##   female variable   Obs   min   mean median st.dev    max
##    <dbl> <chr>    <int> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1      0 earnings 72015     4 76289.  55000 84401. 770000
## 2      1 earnings 72045     4 53621.  40000 57166. 682000

"
#This is the regular summary without using nv_sum function
usa21_inc%>%
  group_by(female)%>%
  summarise(mean_wage =mean(earnings),
            total=n())
"

## [1] "\n#This is the regular summary without using nv_sum function\nusa21_inc%>%\n  group_by(female)%>%\n  summarise(mean_wage =mean(earnings),\n            total=n())\n"

# Regress earnings on the female
Model6<-lm(earnings~female, data = usa21_inc)
summary(Model6)

## 
## Call:
## lm(formula = earnings ~ female, data = usa21_inc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -76285 -37621 -16289  13711 693711 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  76289.0      305.6  249.66   <2e-16 ***
## female      -22667.8      441.1  -51.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72620 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.02375,    Adjusted R-squared:  0.02374 
## F-statistic:  2641 on 1 and 108592 DF,  p-value: < 2.2e-16

# creating a dummy variable male
usa21_inc<-usa21_inc%>%
  mutate(male =ifelse(sex =="male",1, 0))

#summarize earnings
male_usa21_inc<-usa21_inc%>%
  group_by(male)
nv_sum(male_usa21_inc, earnings, weight = FALSE)

## # A tibble: 2 × 8
##    male variable   Obs   min   mean median st.dev    max
##   <dbl> <chr>    <int> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1     0 earnings 72045     4 53621.  40000 57166. 682000
## 2     1 earnings 72015     4 76289.  55000 84401. 770000

# Regress earnings on the male
Model7<-lm(earnings~male, data = usa21_inc)
summary(Model7)

## 
## Call:
## lm(formula = earnings ~ male, data = usa21_inc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -76285 -37621 -16289  13711 693711 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  53621.3      318.1  168.59   <2e-16 ***
## male         22667.8      441.1   51.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72620 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.02375,    Adjusted R-squared:  0.02374 
## F-statistic:  2641 on 1 and 108592 DF,  p-value: < 2.2e-16

# regress ln_earnings on female
Model8<-lm(ln_earnings~female, data = usa21_inc)
summary(Model8)

## 
## Call:
## lm(formula = ln_earnings ~ female, data = usa21_inc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4141 -0.4381  0.1597  0.6452  2.9959 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.800438   0.004603 2346.54   <2e-16 ***
## female      -0.363524   0.006644  -54.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.094 on 108592 degrees of freedom
##   (35466 observations deleted due to missingness)
## Multiple R-squared:  0.02683,    Adjusted R-squared:  0.02682 
## F-statistic:  2994 on 1 and 108592 DF,  p-value: < 2.2e-16

Regression Analysis (Econ 0211), Lab3: Unit of measurement, functional form, and summy regressors

Noe Vyizigiro

March, 2024

1. loading data

2. Estimating regression of earning on years

3. generating residuals and fitted values

creating graphs

4. Estimating a regression model of earnings on months of schooling

5. Estimating a regression model of earnings(in cents) on years of education

6. multiplying by 5

7. data transformation using log

8. Estimating a fitted model of log of earnings on years of education.

9. Dummy variable as explainatory variable