Important packages/libraries for this lab:

  1. tidyverse
  2. stargazer
  3. sandwich: robust st.err
  4. broom: extract coefficients/margins from linear models
  5. lmtest: model tests
  6. mfx: computing marginal effects for nonlinear models

1. Read data

#2 generate dummy variables

titanic<-titanic%>%
  mutate(female =ifelse(gender =="female", 1,0),
         child = ifelse(age<16,1,0),
         survive =ifelse(survived=="Survive", 1,0),
         class1 =ifelse(pclass==1,1,0),
         class2 =ifelse(pclass==2,1,0),
         class3 =ifelse(pclass==3,1,0)
         )
#titanic<-titanic%>%
  #na.omit()

3. descriptive statistic

summary_table<-titanic%>%
  group_by(pclass, gender)%>%
  summarise(mean_survived = mean(survive, na.rm=TRUE),
            )%>%
  pivot_wider(names_from = pclass,
              values_from = mean_survived)
## `summarise()` has grouped output by 'pclass'. You can override using the
## `.groups` argument.
print(summary_table)
## # A tibble: 2 × 4
##   gender   `1`   `2`   `3`
##   <chr>  <dbl> <dbl> <dbl>
## 1 female 0.965 0.887 0.491
## 2 male   0.341 0.146 0.152

4. Regression

m1<-lm(survive~child+female+class1+class2, data=titanic)
m_se1<-sqrt(diag(vcovHC(m1,"HC1"))) #robust se
summary(m1)
## 
## Call:
## lm(formula = survive ~ child + female + class1 + class2, data = titanic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09083 -0.22360 -0.07999  0.27943  0.92001 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.07999    0.01991   4.018 6.30e-05 ***
## child        0.19476    0.03966   4.910 1.06e-06 ***
## female       0.49697    0.02557  19.436  < 2e-16 ***
## class1       0.31911    0.03000  10.636  < 2e-16 ***
## class2       0.14361    0.03016   4.762 2.19e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3928 on 1041 degrees of freedom
##   (263 observations deleted due to missingness)
## Multiple R-squared:  0.3644, Adjusted R-squared:  0.362 
## F-statistic: 149.2 on 4 and 1041 DF,  p-value: < 2.2e-16

5. Estimate linear probability model

m2<-lm(survive~child+female+class1+class2, data=titanic, na.action = na.exclude)
m_se2<-sqrt(diag(vcovHC(m2,"HC1"))) #robust se

stargazer(m2,
          se =list(m_se2),
          type = "text",
          out = "LPM.txt",
          title = "LPM",
          notes.append = TRUE)
## 
## LPM
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               survive          
## -----------------------------------------------
## child                        0.195***          
##                               (0.048)          
##                                                
## female                       0.497***          
##                               (0.027)          
##                                                
## class1                       0.319***          
##                               (0.032)          
##                                                
## class2                       0.144***          
##                               (0.029)          
##                                                
## Constant                     0.080***          
##                               (0.019)          
##                                                
## -----------------------------------------------
## Observations                   1,046           
## R2                             0.364           
## Adjusted R2                    0.362           
## Residual Std. Error      0.393 (df = 1041)     
## F Statistic          149.226*** (df = 4; 1041) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

6. Check predicted outcome

summary(m2)
## 
## Call:
## lm(formula = survive ~ child + female + class1 + class2, data = titanic, 
##     na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09083 -0.22360 -0.07999  0.27943  0.92001 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.07999    0.01991   4.018 6.30e-05 ***
## child        0.19476    0.03966   4.910 1.06e-06 ***
## female       0.49697    0.02557  19.436  < 2e-16 ***
## class1       0.31911    0.03000  10.636  < 2e-16 ***
## class2       0.14361    0.03016   4.762 2.19e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3928 on 1041 degrees of freedom
##   (263 observations deleted due to missingness)
## Multiple R-squared:  0.3644, Adjusted R-squared:  0.362 
## F-statistic: 149.2 on 4 and 1041 DF,  p-value: < 2.2e-16
titanic$yhat<-predict(m2) # predict yhat
nv_sum(titanic, yhat, weight=FALSE) # summarise yhat
## # A tibble: 1 × 7
##   variable   Obs    min  mean median st.dev   max
##   <chr>    <int>  <dbl> <dbl>  <dbl>  <dbl> <dbl>
## 1 yhat      1309 0.0800 0.408  0.399  0.297  1.09

7. Probit Regression Model

p_model<-glm(survive~child+female+class1+class2,family = binomial(link = "probit"), data = titanic)
#summary(p_model)
logLik(p_model) # log Likelihood value
## 'log Lik.' -497.2332 (df=5)
lrtest(p_model) # test of overall significance of the model, including the log likelihood value
## Likelihood ratio test
## 
## Model 1: survive ~ child + female + class1 + class2
## Model 2: survive ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   5 -497.23                         
## 2   1 -707.31 -4 420.15  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
l_model<-glm(survive~child+female+class1+class2,family = binomial(link = "logit"), data = titanic)
#summary(l_model)
logLik(l_model) # log Likelihood value
## 'log Lik.' -495.7903 (df=5)
stargazer(p_model, l_model,
          type = "text",
          out = "probability.txt",
          title = "Probability Model",
          notes.append = TRUE
  
)
## 
## Probability Model
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                             survive           
##                       probit       logistic   
##                        (1)            (2)     
## ----------------------------------------------
## child                0.628***      1.171***   
##                      (0.144)        (0.251)   
##                                               
## female               1.490***      2.508***   
##                      (0.093)        (0.165)   
##                                               
## class1               1.085***      1.907***   
##                      (0.112)        (0.198)   
##                                               
## class2               0.489***      0.924***   
##                      (0.111)        (0.197)   
##                                               
## Constant            -1.313***      -2.304***  
##                      (0.083)        (0.158)   
##                                               
## ----------------------------------------------
## Observations          1,046          1,046    
## Log Likelihood       -497.233      -495.790   
## Akaike Inf. Crit.   1,004.466      1,001.581  
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01

Computing margins/Partial Effects

To calculate the margins, we use the function logitmfx for a logit model and probitmfx for a probit model. Use tidy function from broom package to extract marginal effects for linear probability model

#linear
lpm<-lm(survive~child+female+class1+class2, data=titanic, na.action = na.exclude)
lpm_tidy<-tidy(lpm)
lme <- lpm_tidy$estimate #extract margins (coefficients)
lse <- lpm_tidy$std.error #extract st err

margins(lpm) # coefficients of the lpm
## Average marginal effects
## lm(formula = survive ~ child + female + class1 + class2, data = titanic,     na.action = na.exclude)
##   child female class1 class2
##  0.1948  0.497 0.3191 0.1436
#probit
pm<-glm(survive~child+female+class1+class2,family = binomial(link = "probit"), data = titanic) # probability model

atmp<-probitmfx(survive~child+female+class1+class2, data = titanic, atmean = TRUE) # Marginal Effect at the average/ Partial Effect Average (PEA)
atmep<-atmp$mfxest[,"dF/dx"] #margins
atse<-atmp$mfxest[,"Std. Err."] #st.errors

avmp<-probitmfx(survive~child+female+class1+class2, data = titanic, atmean = FALSE) # Average Marginal effect /Average Partial Effect (APE)
avmep<-avmp$mfxest[,"dF/dx"] #margins
avse<-avmp$mfxest[,"Std. Err."] #st.error

#logit
lm<-glm(survive~child+female+class1+class2,family = binomial(link = "logit"), data = titanic) # logit model

atml<-logitmfx(survive~child+female+class1+class2, data = titanic, atmean = TRUE) # Marginal Effect at the average/ Partial Effect Average (PEA)
atmel<-atml$mfxest[,"dF/dx"] #margins
atsel<-atml$mfxest[,"Std. Err."] # st.error

avml<-logitmfx(survive~child+female+class1+class2, data = titanic, atmean = FALSE) # Average Marginal effect /Average Partial Effect (APE)
avmel<-avml$mfxest[,"dF/dx"] # margins
avsel<-avml$mfxest[,"Std. Err."] # st.errors


stargazer(lpm,lpm, pm,pm,pm,lm,lm,lm,
          coef = list(NULL,lme,NULL, atmep, avmep,NULL, atmel, avmel),
          se = list(NULL,lse, NULL, atse,avse,NULL,atsel,avsel),
          type = "text",
          out = "probability.txt",
          title = "Probability Model",
          notes.append = TRUE)
## 
## Probability Model
## =================================================================================================================
##                                                                Dependent variable:                               
##                                 ---------------------------------------------------------------------------------
##                                                                      survive                                     
##                                          OLS                     probit                       logistic           
##                                    (1)        (2)        (3)       (4)       (5)       (6)       (7)       (8)   
## -----------------------------------------------------------------------------------------------------------------
## child                            0.195***   0.195***  0.628***  0.246***  0.176***  1.171***  0.285***  0.189*** 
##                                  (0.040)    (0.040)    (0.144)   (0.055)   (0.041)   (0.251)   (0.058)   (0.041) 
##                                                                                                                  
## female                           0.497***   0.497***  1.490***  0.542***  0.497***  2.508***  0.553***  0.495*** 
##                                  (0.026)    (0.026)    (0.093)   (0.029)   (0.028)   (0.165)   (0.029)   (0.028) 
##                                                                                                                  
## class1                           0.319***   0.319***  1.085***  0.412***  0.317***  1.907***  0.443***  0.323*** 
##                                  (0.030)    (0.030)    (0.112)   (0.039)   (0.031)   (0.198)   (0.040)   (0.031) 
##                                                                                                                  
## class2                           0.144***   0.144***  0.489***  0.191***  0.132***  0.924***  0.223***  0.144*** 
##                                  (0.030)    (0.030)    (0.111)   (0.043)   (0.030)   (0.197)   (0.047)   (0.030) 
##                                                                                                                  
## Constant                         0.080***   0.080***  -1.313***                     -2.304***                    
##                                  (0.020)    (0.020)    (0.083)                       (0.158)                     
##                                                                                                                  
## -----------------------------------------------------------------------------------------------------------------
## Observations                      1,046      1,046      1,046     1,046     1,046     1,046     1,046     1,046  
## R2                                0.364      0.364                                                               
## Adjusted R2                       0.362      0.362                                                               
## Log Likelihood                                        -497.233  -497.233  -497.233  -495.790  -495.790  -495.790 
## Akaike Inf. Crit.                                     1,004.466 1,004.466 1,004.466 1,001.581 1,001.581 1,001.581
## Residual Std. Error (df = 1041)   0.393      0.393                                                               
## F Statistic (df = 4; 1041)      149.226*** 149.226***                                                            
## =================================================================================================================
## Note:                                                                                 *p<0.1; **p<0.05; ***p<0.01

we can make some predictions of survival based on different characteristics

lpm <- lm(survive ~ child + female + class1 + class2, data = titanic)
print("LPM")
## [1] "LPM"
summary(lpm)
## 
## Call:
## lm(formula = survive ~ child + female + class1 + class2, data = titanic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09083 -0.22360 -0.07999  0.27943  0.92001 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.07999    0.01991   4.018 6.30e-05 ***
## child        0.19476    0.03966   4.910 1.06e-06 ***
## female       0.49697    0.02557  19.436  < 2e-16 ***
## class1       0.31911    0.03000  10.636  < 2e-16 ***
## class2       0.14361    0.03016   4.762 2.19e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3928 on 1041 degrees of freedom
##   (263 observations deleted due to missingness)
## Multiple R-squared:  0.3644, Adjusted R-squared:  0.362 
## F-statistic: 149.2 on 4 and 1041 DF,  p-value: < 2.2e-16
lpm_margins <- margins(lpm, at = list(child = 1, female = 1, class1 = 0, class2 = 0))
print(lpm_margins)
## Average marginal effects at specified values
## lm(formula = survive ~ child + female + class1 + class2, data = titanic)
##  at(child) at(female) at(class1) at(class2)  child female class1 class2
##          1          1          0          0 0.1948  0.497 0.3191 0.1436
#  using probit
summary(pm)
## 
## Call:
## glm(formula = survive ~ child + female + class1 + class2, family = binomial(link = "probit"), 
##     data = titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6562  -0.6769  -0.4457   0.7636   2.1720  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.31328    0.08326 -15.774  < 2e-16 ***
## child        0.62822    0.14393   4.365 1.27e-05 ***
## female       1.49005    0.09326  15.978  < 2e-16 ***
## class1       1.08514    0.11151   9.732  < 2e-16 ***
## class2       0.48858    0.11131   4.389 1.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.62  on 1045  degrees of freedom
## Residual deviance:  994.47  on 1041  degrees of freedom
##   (263 observations deleted due to missingness)
## AIC: 1004.5
## 
## Number of Fisher Scoring iterations: 5
probit_prediction <- pnorm(-1.308744 + 0.6310225 + 1.483612)
print(probit_prediction)
## [1] 0.789847
probit_margins <- margins(pm, at = list(child = 1, female = 1, class1 = 0, class2 = 0))
print(probit_margins)
## Average marginal effects at specified values
## glm(formula = survive ~ child + female + class1 + class2, family = binomial(link = "probit"),     data = titanic)
##  at(child) at(female) at(class1) at(class2)  child female class1 class2
##          1          1          0          0 0.1813 0.4299 0.3131  0.141
# using logit model
logit_margins <- margins(lm, at = list(child = 1, female = 1, class1 = 0, class2 = 0))
print(logit_margins)
## Average marginal effects at specified values
## glm(formula = survive ~ child + female + class1 + class2, family = binomial(link = "logit"),     data = titanic)
##  at(child) at(female) at(class1) at(class2)  child female class1 class2
##          1          1          0          0 0.1886 0.4039 0.3072 0.1488