Important packages (libraries) for this lab:

  1. tidyverse: Contain libraries such as dplyr and ggplot, which are instrumental here
  2. purrr

load necessary libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
if(!require("purrr")) install.packages("purrr")
library(purrr) #this will help in summarizing the variable

rm(list = ls()) #clear the environment

Load the data

usa21<-read_csv("IPUMS_USA_2021_ACS.csv")
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 325718 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): sample, gq, sex, age, educ, educd, incwage
## dbl (9): year, serial, cbserial, hhwt, cluster, strata, pernum, perwt, incearn
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# make some variables numeric
columns <-c("incwage", "incearn", "year")
usa21[, columns] <- lapply(columns, function(x) as.numeric(usa21[[x]]))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

4. Looking at the earning variables

Creating a function that can take in multiple arguments(variable) and return a table of the variables with their summary statistics. We use function enquos to capture multiple variables as a list of quosures. We use map_dfr function from the package purrr to iterate over each variable, calculate summary statistics, and combine the results into a single data frame.

nv_sum<-function(datax,...){
  vars<-enquos(...) # capture multiple variables and assign them to the variable vars
  map_dfr(vars, function(varx){ # iterate over the variables in vars to compute summary statistics
  var_name<-as_label(varx)
  datax%>%
    summarise(variable =var_name,
              min =min({{varx}}, na.rm = TRUE),
              mean =mean({{varx}}, na.rm=TRUE),
              median =median(!!varx, na.rm =TRUE),
              st.dev =sd(!!varx, na.rm =TRUE),
              max=max(!!varx, na.rm = TRUE),
              n =n()
    ) 
  })
}
# varx can be either between {{}} or start with !! to ensure that the column name passed to the function is correctly referenced within summarise
nv_sum(usa21, incwage, incearn)
## # A tibble: 2 × 7
##   variable   min   mean median st.dev     max      n
##   <chr>    <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <int>
## 1 incwage      0 32593.   7000 60835.  787000 325718
## 2 incearn  -9700 28897.    300 59948. 1174000 325718
filtered_data<-usa21%>%
  filter(age>=25 & age<=60)

nv_sum(filtered_data, incwage) # call the nv_sum function created already
## # A tibble: 1 × 7
##   variable   min   mean median st.dev    max      n
##   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <int>
## 1 incwage      0 49306.  34000 69753. 770000 157079

We only want to look at people with positive earnings (for now) so set all others to missing set our earnings variable to missing if respondent has no employee income and/or is not in university for incomewage

usa21$incwage[usa21$incwage==0]<-NA #replacing zeros in the incwage variable with NAs
usa21_0<-usa21 %>% 
  na.omit() # get rid of all missing values 

6. Look for an education variable and recorde

table(usa21_0$educ)
## 
##         1 year of college        2 years of college        4 years of college 
##                     21314                     14166                     35767 
##       5+ years of college                  grade 10                  grade 11 
##                     23427                      2043                      3101 
##                  grade 12       grade 5, 6, 7, or 8                   grade 9 
##                     49339                      1881                      1144 
##       n/a or no schooling nursery school to grade 4 
##                      1897                       369
table(usa21_0$educd)
## 
##   1 or more years of college credit, no degree 
##                                          21314 
##                         12th grade, no diploma 
##                                           2460 
##         associate's degree, type not specified 
##                                          14166 
##                              bachelor's degree 
##                                          35767 
##                                doctoral degree 
##                                           2951 
##                  ged or alternative credential 
##                                           4870 
##                                        grade 1 
##                                             45 
##                                       grade 10 
##                                           2043 
##                                       grade 11 
##                                           3101 
##                                        grade 2 
##                                             51 
##                                        grade 3 
##                                            119 
##                                        grade 4 
##                                             98 
##                                        grade 5 
##                                            215 
##                                        grade 6 
##                                            761 
##                                        grade 7 
##                                            173 
##                                        grade 8 
##                                            732 
##                                        grade 9 
##                                           1144 
##                                   kindergarten 
##                                             28 
##                                master's degree 
##                                          16474 
##                         no schooling completed 
##                                           1897 
##                      nursery school, preschool 
##                                             28 
## professional degree beyond a bachelor's degree 
##                                           4002 
##                    regular high school diploma 
##                                          31342 
##             some college, but less than 1 year 
##                                          10667
 #or
usa21_0%>%
  count(educ)
## # A tibble: 11 × 2
##    educ                          n
##    <chr>                     <int>
##  1 1 year of college         21314
##  2 2 years of college        14166
##  3 4 years of college        35767
##  4 5+ years of college       23427
##  5 grade 10                   2043
##  6 grade 11                   3101
##  7 grade 12                  49339
##  8 grade 5, 6, 7, or 8        1881
##  9 grade 9                    1144
## 10 n/a or no schooling        1897
## 11 nursery school to grade 4   369

Generate numeric values of the values in variable educd

usa21_0<-usa21_0%>%
  mutate(educCode=with(usa21_0, match(educd, unique(educd))))

or the same thing can be achieved by: usa21_0\(educCode = as.integer(factor(usa21_0\)educd, levels = unique(usa21_0$educd)))

Summarize educd with educCode to see the matches

# or we can group by educd and use the new dataset as an argument in our function nv_sum().
grouped_data<-usa21_0%>%
  group_by(educd)

nv_sum(grouped_data, educCode)
## # A tibble: 24 × 8
##    educd                          variable   min  mean median st.dev   max     n
##    <chr>                          <chr>    <int> <dbl>  <dbl>  <dbl> <int> <int>
##  1 1 or more years of college cr… educCode     2     2      2      0     2 21314
##  2 12th grade, no diploma         educCode    13    13     13      0    13  2460
##  3 associate's degree, type not … educCode     8     8      8      0     8 14166
##  4 bachelor's degree              educCode    11    11     11      0    11 35767
##  5 doctoral degree                educCode     9     9      9      0     9  2951
##  6 ged or alternative credential  educCode     7     7      7      0     7  4870
##  7 grade 1                        educCode    24    24     24      0    24    45
##  8 grade 10                       educCode     3     3      3      0     3  2043
##  9 grade 11                       educCode     6     6      6      0     6  3101
## 10 grade 2                        educCode    22    22     22      0    22    51
## # ℹ 14 more rows

Generating a new variables “edyears” with numbers of years of education

usa21_1<- usa21_0%>% 
   mutate(edyearF =factor(educCode, c(2, 13, 8, 11, 9,7,24,3,6,22, 18, 19, 21, 16, 5, 17, 15, 20, 10, 14, 23, 12, 1, 4 ), 
 labels=c('13','11','14','16','20','12','1','10','11','2','3','4','5','6','7','8','9','0','18','0','0','18','12','13')),
 edyears = as.numeric(as.character(edyearF)))
 
usa21_1
## # A tibble: 154,448 × 19
##     year sample   serial  cbserial  hhwt cluster strata gq    pernum perwt sex  
##    <dbl> <chr>     <dbl>     <dbl> <dbl>   <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>
##  1  2021 2021 acs     41   2.02e12   460 2.02e12 200001 othe…      1   460 fema…
##  2  2021 2021 acs     51   2.02e12   420 2.02e12  10001 othe…      1   420 fema…
##  3  2021 2021 acs     61   2.02e12   590 2.02e12 190001 othe…      1   590 male 
##  4  2021 2021 acs     71   2.02e12    30 2.02e12 140001 othe…      1    30 male 
##  5  2021 2021 acs     81   2.02e12   570 2.02e12  30201 othe…      1   570 fema…
##  6  2021 2021 acs    141   2.02e12   650 2.02e12 240001 othe…      1   650 fema…
##  7  2021 2021 acs    201   2.02e12   520 2.02e12  40001 grou…      1   520 male 
##  8  2021 2021 acs    261   2.02e12   650 2.02e12 110001 grou…      1   650 male 
##  9  2021 2021 acs    291   2.02e12   690 2.02e12 130201 othe…      1   690 fema…
## 10  2021 2021 acs    321   2.02e12   500 2.02e12 160001 othe…      1   500 fema…
## # ℹ 154,438 more rows
## # ℹ 8 more variables: age <chr>, educ <chr>, educd <chr>, incwage <dbl>,
## #   incearn <dbl>, educCode <int>, edyearF <fct>, edyears <dbl>

Check if the match was successful

usa21_1%>%
 group_by(educd) %>% 
  summarise(means = mean(edyears))
## # A tibble: 24 × 2
##    educd                                        means
##    <chr>                                        <dbl>
##  1 1 or more years of college credit, no degree    13
##  2 12th grade, no diploma                          11
##  3 associate's degree, type not specified          14
##  4 bachelor's degree                               16
##  5 doctoral degree                                 20
##  6 ged or alternative credential                   12
##  7 grade 1                                          1
##  8 grade 10                                        10
##  9 grade 11                                        11
## 10 grade 2                                          2
## # ℹ 14 more rows

7. Relationship between wage and years of education

edyear_usa21_1<-usa21_1%>%
  group_by(edyears)
nv_sum(edyear_usa21_1, incwage) # call the function nv_sum()
## # A tibble: 18 × 8
##    edyears variable   min    mean median  st.dev    max     n
##      <dbl> <chr>    <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <int>
##  1       0 incwage      4  34666.  27000  42854. 634000  1953
##  2       1 incwage    670  27814.  25000  18635.  80000    45
##  3       2 incwage   1600  24127.  25000  14467.  70000    51
##  4       3 incwage    200  29781.  24000  27434. 156000   119
##  5       4 incwage    190  26284.  24400  21431. 159000    98
##  6       5 incwage    290  28754.  24000  23036. 160000   215
##  7       6 incwage    200  30654.  26300  21928. 192000   761
##  8       7 incwage     50  32335.  27000  30318. 260000   173
##  9       8 incwage     30  33461.  25000  44274. 554000   732
## 10       9 incwage      4  26928.  22000  35075. 568000  1144
## 11      10 incwage     20  18806.   9000  27670. 502000  2043
## 12      11 incwage      4  22773.  13000  32042. 627000  5561
## 13      12 incwage      4  36306.  30000  38667. 682000 36212
## 14      13 incwage     10  40204.  30000  45806. 770000 31981
## 15      14 incwage     20  48132.  40000  42930. 634000 14166
## 16      16 incwage      4  75868.  58000  80267. 787000 35767
## 17      18 incwage     30 105090.  75000 110291. 770000 20476
## 18      20 incwage     20 116607.  90000 108714. 682000  2951
gredyear_usa21_1<-usa21_1%>%
  group_by(edyears)%>%
  filter(age>=25 & age<=60) %>% 
  summarise(mean_wage=mean(incwage) )

8. Compare average earnings for college and non-college grads

usa21_1%>%
  mutate(college_grads =ifelse(edyears>=16,1,0))%>%
  group_by(college_grads)%>%
  filter(age>=25 & age<=60)%>%
  summarise(mean_wage=mean(incwage) )
## # A tibble: 2 × 2
##   college_grads mean_wage
##           <dbl>     <dbl>
## 1             0    44707.
## 2             1    91904.

9.Graphs

usa21_1%>%
  filter(age>=25 & age<=60)%>%
  ggplot(aes(edyears, incwage))+
           geom_point()+
  labs(
    title = "Scatter Plot of Education vs. Earnings",
    subtitle="2021 American Community Survey",
    x="Years of education",y="wage($)"
  )

usa21_1%>%
  filter(age>=25 & age<=60)%>%
  ggplot(aes(edyears, incwage))+
           geom_point()+
  geom_smooth(method=lm, se =FALSE)+
  labs(
    title = "Scatter Plot of Education vs. Earnings",
    subtitle="2021 American Community Survey",
    x="Years of education",y="wage($)"
  )
## `geom_smooth()` using formula = 'y ~ x'