This is a second version of the lab 6 where we use the dataset imported from the stata version of this lab. Using this dataset corrects all the slight mismatch between the r and stata results
#Load libraries
ATUS<-read.csv("IPUMS_ATUS_2019.csv")
source("/Volumes/middfiles/Classes/Fall23/ECON0211A/Noe_Labs/NV.R") #load NV.R to access nv_sum() function for quick summary statistic table
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
ATUS<-ATUS%>%
mutate(codeduc=with(ATUS, match(educ, unique(educ))))
ATUS%>%
group_by(educ, codeduc)%>%
summarise(mean_l<-mean(leisure))
## `summarise()` has grouped output by 'educ'. You can override using the
## `.groups` argument.
## # A tibble: 17 × 3
## # Groups: educ [17]
## educ codeduc mean_l <- mean(leisu…¹
## <chr> <int> <dbl>
## 1 10th grade 1 407.
## 2 11th grade 4 395.
## 3 12th grade - no diploma 5 401.
## 4 1st, 2nd, 3rd, or 4th grade 15 358.
## 5 5th or 6th grade 8 362.
## 6 7th or 8th grade 2 431.
## 7 9th grade 3 367.
## 8 associate degree - academic program 9 328.
## 9 associate degree - occupational vocational 13 341.
## 10 bachelor's degree (ba, ab, bs, etc.) 11 311.
## 11 doctoral degree (phd, edd, etc.) 10 302.
## 12 high school graduate - diploma 6 373.
## 13 high school graduate - ged 12 394.
## 14 less than 1st grade 17 350.
## 15 master's degree (ma, ms, meng, med, msw, etc.) 14 293.
## 16 professional school degree (md, dds, dvm, etc… 16 295.
## 17 some college but no degree 7 360.
## # ℹ abbreviated name: ¹​`mean_l <- mean(leisure)`
#encode
ATUS<-ATUS%>%
mutate(edyearF=factor(codeduc, c(1,4,5,15,8,2,3,9,13,11,10,6,12,17,14,16,7),
labels=c("10","11","11", "3", "6", "7", "9", "14","14","16", "20","12", "12","0", "18","18", "14")),
edyear=as.numeric(as.character(edyearF)))
#summarize to check the match
ATUS%>%
group_by(educ, edyear)%>%
summarise(mean_l = mean(leisure))
## `summarise()` has grouped output by 'educ'. You can override using the
## `.groups` argument.
## # A tibble: 17 × 3
## # Groups: educ [17]
## educ edyear mean_l
## <chr> <dbl> <dbl>
## 1 10th grade 10 407.
## 2 11th grade 11 395.
## 3 12th grade - no diploma 11 401.
## 4 1st, 2nd, 3rd, or 4th grade 3 358.
## 5 5th or 6th grade 6 362.
## 6 7th or 8th grade 7 431.
## 7 9th grade 9 367.
## 8 associate degree - academic program 14 328.
## 9 associate degree - occupational vocational 14 341.
## 10 bachelor's degree (ba, ab, bs, etc.) 16 311.
## 11 doctoral degree (phd, edd, etc.) 20 302.
## 12 high school graduate - diploma 12 373.
## 13 high school graduate - ged 12 394.
## 14 less than 1st grade 0 350.
## 15 master's degree (ma, ms, meng, med, msw, etc.) 18 293.
## 16 professional school degree (md, dds, dvm, etc.) 18 295.
## 17 some college but no degree 14 360.
#check employment
ATUS%>%
group_by(empstat)%>%
summarise(mean_emp=mean(leisure))
## # A tibble: 5 × 2
## empstat mean_emp
## <chr> <dbl>
## 1 employed - absent 362.
## 2 employed - at work 272.
## 3 not in labor force 448.
## 4 unemployed - looking 381.
## 5 unemployed - on layoff 403.
#check kidund18
ATUS%>%
group_by(kidund18)%>%
summarise(mean_kid=mean(leisure))
## # A tibble: 2 × 2
## kidund18 mean_kid
## <chr> <dbl>
## 1 no 381.
## 2 yes 249.
ATUS<-ATUS%>%
mutate(LTHS =ifelse(edyear<12,1,0),
HS = ifelse(edyear==12,1,0),
SC = ifelse(edyear>12 & edyear<16, 1, 0),
BA = ifelse(edyear==16,1,0),
GRAD=ifelse(edyear>16,1,0),
employed =ifelse(empstat=="employed - at work",1,0),
male =ifelse(sex=="male",1,0),
child=ifelse(kidund18=="yes",1,0))
#summarize the 10 variables; using nv_sum()
nv_sum(ATUS, LTHS, HS, SC, BA, GRAD, employed, male, child, leisure, age, weight = FALSE)
## variable Obs min mean median st.dev max
## 1 LTHS 9435 0 0.1062003 0 0.3081102 1
## 2 HS 9435 0 0.2301007 0 0.4209194 1
## 3 SC 9435 0 0.2682565 0 0.4430753 1
## 4 BA 9435 0 0.2331744 0 0.4228747 1
## 5 GRAD 9435 0 0.1622682 0 0.3687162 1
## 6 employed 9435 0 0.5775305 1 0.4939786 1
## 7 male 9435 0 0.4569157 0 0.4981667 1
## 8 child 9435 0 0.2960254 0 0.4565265 1
## 9 leisure 9435 0 342.0162162 305 229.0125318 1300
## 10 age 9435 15 51.0865925 51 18.1744901 85
# select variables to summarise
selected_vars <-dplyr::select(ATUS,LTHS, HS,SC, BA, GRAD, employed, male,child, leisure, age)
# Create a survey design object, from survey package
sv_obj <- svydesign(id = ~1, weights = ~wt06, data = ATUS) # id: formula specifying cluster id; ~1 indicate that there is no cluster; weights: formula specifying the sampling weights
# Function to calculate weighted mean and standard error
summary_stats <-function(x,y) { # supply function iterate through the selected variables x while using the weights in the sv_object (y)
c(
obs =sum(!is.na(x)),
mean=svymean(x,y, na.rm=TRUE),#calculate the weighted mean
Std.dev = sqrt(svyvar(x,y, na.rm = TRUE)),# calculate the weighted SD
min = min(x, na.rm = TRUE),
max =max(x, na.rm = TRUE)
# quant=svyquantile(x,y, 0.50, na.rm=TRUE)
)
}
sum_table<-sapply(selected_vars, summary_stats, sv_obj) # use the sapply function here to the whole created function to create a vector-type data
#sum_table
# Convert to a data frame for better presentation
results_df <- as.data.frame(t(sum_table))
# Print the results
print(results_df)
## obs mean Std.dev min max
## LTHS 9435 0.1432380 0.3503339 0 1
## HS 9435 0.2748641 0.4464694 0 1
## SC 9435 0.2417681 0.4281772 0 1
## BA 9435 0.2075264 0.4055572 0 1
## GRAD 9435 0.1326034 0.3391636 0 1
## employed 9435 0.6045367 0.4889759 0 1
## male 9435 0.4843585 0.4997818 0 1
## child 9435 0.2557646 0.4363132 0 1
## leisure 9435 311.1453426 221.0180079 0 1300
## age 9435 46.1374554 18.9997780 15 85
m1<-lm(leisure~HS+ SC+ BA +GRAD, data = ATUS)
#summary(m1)
m2<-lm(leisure~HS+SC+ BA+ GRAD, data =ATUS, weights = wt06) # non robust regression
#summary(m2)
#robust regression
m3<-lm(leisure~HS+SC+ BA+ GRAD, data =ATUS, weights = wt06)
#rob<-coeftest(m3, vcov = vcovHC(m3, "HC1")) # robust; HC1 (Stata default)
rob_se3<-sqrt(diag(vcovHC(m3,"HC1"))) #calculate robust standard error
#rob_se3
stargazer( m1,m2,m3,
se =list(NULL, NULL, rob_se3),
type = "text",
title = "Table 1",
out = "table1.text",
column.labels = c("Regular", "Weighted", "Robust SE 1"),
notes = "Significance level"
)
##
## Table 1
## ==================================================================
## Dependent variable:
## ----------------------------------
## leisure
## Regular Weighted Robust SE 1
## (1) (2) (3)
## ------------------------------------------------------------------
## HS -17.862** -0.331 -0.331
## (8.653) (7.355) (11.437)
##
## SC -43.569*** -30.079*** -30.079***
## (8.457) (7.525) (11.249)
##
## BA -82.241*** -58.748*** -58.748***
## (8.635) (7.753) (10.762)
##
## GRAD -98.076*** -75.240*** -75.240***
## (9.207) (8.601) (10.911)
##
## Constant 392.905*** 340.677*** 340.677***
## (7.158) (5.963) (9.430)
##
## ------------------------------------------------------------------
## Observations 9,435 9,435 9,435
## R2 0.022 0.017 0.017
## Adjusted R2 0.021 0.016 0.016
## Residual Std. Error (df = 9430) 226.578 699,720.800 699,720.800
## F Statistic (df = 4; 9430) 51.960*** 39.932*** 39.932***
## ==================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## Significance level
m1<-lm(leisure~HS + SC + BA + GRAD, data =ATUS, weights = wt06)
r_se1<-sqrt(diag(vcovHC(m1,"HC1")))
m2<-lm(leisure~HS + SC + BA + GRAD + employed, data =ATUS, weights = wt06)
r_se2<-sqrt(diag(vcovHC(m2,"HC1")))
m3<-lm(leisure~HS + SC + BA + GRAD + employed + male, data =ATUS, weights = wt06)
r_se3<-sqrt(diag(vcovHC(m3,"HC1")))
m4<-lm(leisure~HS + SC + BA + GRAD + employed + male + child, data =ATUS, weights = wt06)
r_se4<-sqrt(diag(vcovHC(m4,"HC1")))
m5<-lm(leisure~HS + SC + BA + GRAD + employed + male + child + age, data =ATUS, weights = wt06)
r_se5<-sqrt(diag(vcovHC(m5,"HC1")))
stargazer(m1, m2, m3, m4,m5,
se=list(r_se1, r_se2, r_se3, r_se4, r_se5),
type = "text",
title = "Table 2: Weighted Regression with Robust Standard Error.",
out="table2.txt",
notes.append = TRUE)
##
## Table 2: Weighted Regression with Robust Standard Error.
## ====================================================================================================================================================
## Dependent variable:
## --------------------------------------------------------------------------------------------------------------------------------
## leisure
## (1) (2) (3) (4) (5)
## ----------------------------------------------------------------------------------------------------------------------------------------------------
## HS -0.331 28.668*** 30.404*** 30.436*** 5.966
## (11.437) (10.959) (10.863) (10.925) (10.984)
##
## SC -30.079*** 9.687 15.483 15.315 -4.164
## (11.249) (11.145) (11.180) (11.290) (11.371)
##
## BA -58.748*** -7.716 -2.924 0.552 -21.531**
## (10.762) (10.547) (10.525) (10.630) (10.630)
##
## GRAD -75.240*** -20.940** -15.439 -9.249 -39.215***
## (10.911) (10.633) (10.595) (10.753) (10.880)
##
## employed -161.779*** -170.667*** -156.435*** -138.178***
## (6.119) (6.211) (6.242) (6.762)
##
## male 61.444*** 57.906*** 58.051***
## (5.762) (5.738) (5.627)
##
## child -75.699*** -64.990***
## (5.686) (6.022)
##
## age 1.584***
## (0.183)
##
## Constant 340.677*** 403.103*** 375.113*** 386.073*** 319.120***
## (9.430) (9.564) (9.846) (9.943) (13.123)
##
## ----------------------------------------------------------------------------------------------------------------------------------------------------
## Observations 9,435 9,435 9,435 9,435 9,435
## R2 0.017 0.139 0.158 0.179 0.194
## Adjusted R2 0.016 0.138 0.157 0.178 0.193
## Residual Std. Error 699,720.800 (df = 9430) 654,876.200 (df = 9429) 647,687.500 (df = 9428) 639,544.700 (df = 9427) 633,621.700 (df = 9426)
## F Statistic 39.932*** (df = 4; 9430) 303.814*** (df = 5; 9429) 294.074*** (df = 6; 9428) 293.181*** (df = 7; 9427) 283.610*** (df = 8; 9426)
## ====================================================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#b
m1<-lm(leisure~ HS + SC + BA + GRAD + employed + age + male + child, data = ATUS, weights = wt06)
r_seM1<-sqrt(diag(vcovHC.default(m1, "HC1")))
summary(m1)
##
## Call:
## lm(formula = leisure ~ HS + SC + BA + GRAD + employed + age +
## male + child, data = ATUS, weights = wt06)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4739440 -356313 -7432 378327 5288153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 319.1197 7.8162 40.828 < 2e-16 ***
## HS 5.9661 6.9511 0.858 0.3908
## SC -4.1645 7.0585 -0.590 0.5552
## BA -21.5313 7.3484 -2.930 0.0034 **
## GRAD -39.2150 8.2495 -4.754 2.03e-06 ***
## employed -138.1779 4.6237 -29.885 < 2e-16 ***
## age 1.5844 0.1187 13.344 < 2e-16 ***
## male 58.0507 4.1397 14.023 < 2e-16 ***
## child -64.9902 4.8814 -13.314 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 633600 on 9426 degrees of freedom
## Multiple R-squared: 0.194, Adjusted R-squared: 0.1933
## F-statistic: 283.6 on 8 and 9426 DF, p-value: < 2.2e-16
r_seM1
## (Intercept) HS SC BA GRAD employed
## 13.1227179 10.9839615 11.3708467 10.6299208 10.8803269 6.7622559
## age male child
## 0.1833738 5.6266617 6.0219273
t1<-linearHypothesis(m1, "BA =GRAD")
print(t1)
## Linear hypothesis test
##
## Hypothesis:
## BA - GRAD = 0
##
## Model 1: restricted model
## Model 2: leisure ~ HS + SC + BA + GRAD + employed + age + male + child
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9427 3.7867e+15
## 2 9426 3.7843e+15 1 2.4139e+12 6.0126 0.01422 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#c
t2<-linearHypothesis(m1, c("HS", "SC", "BA", "GRAD"))
print(t2)
## Linear hypothesis test
##
## Hypothesis:
## HS = 0
## SC = 0
## BA = 0
## GRAD = 0
##
## Model 1: restricted model
## Model 2: leisure ~ HS + SC + BA + GRAD + employed + age + male + child
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9430 3.8057e+15
## 2 9426 3.7843e+15 4 2.1406e+13 13.329 7.841e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#a
linearHypothesis(m1, c("child =0", "employed=0"))
## Linear hypothesis test
##
## Hypothesis:
## child = 0
## employed = 0
##
## Model 1: restricted model
## Model 2: leisure ~ HS + SC + BA + GRAD + employed + age + male + child
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9428 4.2703e+15
## 2 9426 3.7843e+15 2 4.8603e+14 605.3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
linearHypothesis(m1, "child-employed=0") # or linearHypothesis(m1, "child=employed =0)
## Linear hypothesis test
##
## Hypothesis:
## - employed + child = 0
##
## Model 1: restricted model
## Model 2: leisure ~ HS + SC + BA + GRAD + employed + age + male + child
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9427 3.8259e+15
## 2 9426 3.7843e+15 1 4.1573e+13 103.55 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
linearHypothesis(m1, "2*child =employed")
## Linear hypothesis test
##
## Hypothesis:
## - employed + 2 child = 0
##
## Model 1: restricted model
## Model 2: leisure ~ HS + SC + BA + GRAD + employed + age + male + child
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9427 3.7845e+15
## 2 9426 3.7843e+15 1 2.0795e+11 0.518 0.4717
#b
linearHypothesis(m1, c("HS", "SC", "BA", "GRAD", "employed", "age", "male", "child "))
## Linear hypothesis test
##
## Hypothesis:
## HS = 0
## SC = 0
## BA = 0
## GRAD = 0
## employed = 0
## age = 0
## male = 0
## child = 0
##
## Model 1: restricted model
## Model 2: leisure ~ HS + SC + BA + GRAD + employed + age + male + child
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9434 4.6952e+15
## 2 9426 3.7843e+15 8 9.109e+14 283.61 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#c
ggplot(ATUS, aes(age, leisure))+
geom_point()+
labs(
title = "scatter plot of age and Leisure"
)
#Scatter plot is not helpful here, we can try something else
sv_ob <- svydesign(id = ~1, weights = ~wt06, data = ATUS)
sum<-ATUS%>%
group_by(age)%>%
summarise(mean_leisure = weighted.mean(leisure, wt06),
n=n())
#sum
#generate a variable that is the average leisure by age
ATUS <-ATUS%>%
group_by(age)%>%
mutate(leisure_byAge = mean(leisure))
ggplot(ATUS, aes(age, leisure_byAge))+
geom_line()+
labs(
title = "Leisure by age and Age"
)
# regress leisure on age
m1<-lm(leisure~age, data = ATUS, weights = wt06)
summary(m1)
##
## Call:
## lm(formula = leisure ~ age, data = ATUS, weights = wt06)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2798926 -435886 -44055 403995 4575406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 183.7885 5.8054 31.66 <2e-16 ***
## age 2.7604 0.1164 23.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 685400 on 9433 degrees of freedom
## Multiple R-squared: 0.05631, Adjusted R-squared: 0.05621
## F-statistic: 562.9 on 1 and 9433 DF, p-value: < 2.2e-16
#robust
rm1<-coeftest(m1,vcov=vcovHC(m1,"HC1")) #robust coefficient and se
se_m1<-sqrt(diag(vcovHC(m1, "HC1")))
#atus19$pred_leisure<-predict(m1)
ATUS$fitted_values<-fitted(m1)
ggplot(ATUS, aes(age, leisure_byAge))+
geom_line(aes(color ="Leisure_byAge"))+
geom_line(aes(y= fitted_values, color ="fitted_values"))+
labs(
title = "Predicted values"
)
#d
ATUS<-ATUS%>%
mutate(age2 =age^2)
for (i in 2:5){
new_val_name <- paste0("age", i)
ATUS[[new_val_name]]<-ATUS$age^i
}
m1<-lm(leisure~ age, weights = wt06, data = ATUS)
ATUS$pred_yht1<-predict(m1)
m1se<-sqrt(diag(vcovHC.default(m1, "HC1")))
m2<-lm(leisure~ age+ age2, weights = wt06, data = ATUS)
ATUS$pred_yht2<-predict(m2)
m2se<-sqrt(diag(vcovHC.default(m2, "HC1")))
m3<-lm(leisure~ age + age2 + age3, weights = wt06, data = ATUS)
ATUS$pred_yht3<-predict(m3)
m3se<-sqrt(diag(vcovHC.default(m3, "HC1")))
m4<-lm(leisure~ age + age2 + age3 + age4, weights = wt06, data = ATUS)
ATUS$pred_yht4<-predict(m4)
m4se<-sqrt(diag(vcovHC.default(m4,"HC1")))
stargazer(m1, m2, m3, m4,
se =list(m1se, m2se, m3se, m4se),
type = "text",
title = "Table 3.",
out="table3.txt",
notes.append = TRUE)
##
## Table 3.
## ===========================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------------------------------
## leisure
## (1) (2) (3) (4)
## ---------------------------------------------------------------------------------------------------------------------------
## age 2.760*** -9.975*** -24.276*** 6.979
## (0.163) (0.851) (3.090) (10.123)
##
## age2 0.134*** 0.460*** -0.670*
## (0.008) (0.066) (0.346)
##
## age3 -0.002*** 0.014***
## (0.0004) (0.005)
##
## age4 -0.0001***
## (0.00002)
##
## Constant 183.789*** 438.048*** 619.588*** 329.830***
## (8.763) (20.252) (44.685) (101.663)
##
## ---------------------------------------------------------------------------------------------------------------------------
## Observations 9,435 9,435 9,435 9,435
## R2 0.056 0.100 0.105 0.107
## Adjusted R2 0.056 0.100 0.104 0.106
## Residual Std. Error 685,358.600 (df = 9433) 669,183.900 (df = 9432) 667,644.500 (df = 9431) 666,942.400 (df = 9430)
## F Statistic 562.858*** (df = 1; 9433) 526.458*** (df = 2; 9432) 367.441*** (df = 3; 9431) 281.377*** (df = 4; 9430)
## ===========================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
ggplot(ATUS, aes(age, leisure_byAge, color = "leisure_byAge"))+
geom_line()+
geom_line(aes(y =pred_yht1, color ="pred_yht1"))+
labs(
title = "Predicted values"
)
ggplot(ATUS, aes(age, leisure_byAge, color = "leisure_byAge"))+
geom_line()+
geom_line(aes(y =pred_yht1, color ="pred_yht1"))+
geom_line(aes(y =pred_yht2, color ="pred_yht2"))+
geom_line(aes(y =pred_yht3, color ="pred_yht3"))+
geom_line(aes(y =pred_yht4, color ="pred_yht4"))+
labs(
title = "Predicted values"
)
ggplot(ATUS, aes(age, leisure_byAge, linetype = "leisure_byAge", color="leisure_byAge"))+
geom_line()+
geom_line(aes(y =pred_yht1, linetype ="linear",color = "linear"))+
geom_line(aes(y =pred_yht2,linetype ="quadratic",color="quadratic"))+
geom_line(aes(y =pred_yht3, linetype ="cubic", color="cubic"))+
geom_line(aes(y =pred_yht4, linetype ="quartic",color="quartic"))+
scale_linetype_manual(values = c("linear"="dashed", "quadratic"="dotted", "cubic"="twodash", "quartic"="dotdash", "leisure_byAge"="solid"))+
scale_color_manual(values = c("linear"="blue", "quadratic"="green", "cubic"="purple", "quartic"="red", "leisure_byAge"="black"))+
labs(
title = "Predicted values",
linetype ="Model", # line type will merge with color type to created an annotated, colored legend based on linetype and color type
color ="Model",
y="Leisure by Age"
)+
theme_minimal()