library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
if(!require("purrr")) install.packages("purrr")
library(purrr) #this will help in summarizing the variable
rm(list = ls()) #clear the environment
usa21<-read_csv("IPUMS_USA_2021_ACS.csv")
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 325718 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): sample, gq, sex, age, educ, educd, incwage
## dbl (9): year, serial, cbserial, hhwt, cluster, strata, pernum, perwt, incearn
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# make some variables numeric
columns <-c("incwage", "incearn", "year")
usa21[, columns] <- lapply(columns, function(x) as.numeric(usa21[[x]]))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
Creating a function that can take in multiple arguments(variable) and return a table of the variables with their summary statistics. We use function enquos to capture multiple variables as a list of quosures. We use map_dfr function from the package purrr to iterate over each variable, calculate summary statistics, and combine the results into a single data frame.
nv_sum<-function(datax,...){
vars<-enquos(...) # capture multiple variables and assign them to the variable vars
map_dfr(vars, function(varx){ # iterate over the variables in vars to compute summary statistics
var_name<-as_label(varx)
datax%>%
summarise(variable =var_name,
min =min({{varx}}, na.rm = TRUE),
mean =mean({{varx}}, na.rm=TRUE),
median =median(!!varx, na.rm =TRUE),
st.dev =sd(!!varx, na.rm =TRUE),
max=max(!!varx, na.rm = TRUE),
n =n()
)
})
}
# varx can be either between {{}} or start with !! to ensure that the column name passed to the function is correctly referenced within summarise
nv_sum(usa21, incwage, incearn)
## # A tibble: 2 × 7
## variable min mean median st.dev max n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 incwage 0 32593. 7000 60835. 787000 325718
## 2 incearn -9700 28897. 300 59948. 1174000 325718
filtered_data<-usa21%>%
filter(age>=25 & age<=60)
nv_sum(filtered_data, incwage) # call the nv_sum function created already
## # A tibble: 1 × 7
## variable min mean median st.dev max n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 incwage 0 49306. 34000 69753. 770000 157079
We only want to look at people with positive earnings (for now) so set all others to missing set our earnings variable to missing if respondent has no employee income and/or is not in university for incomewage
usa21$incwage[usa21$incwage==0]<-NA #replacing zeros in the incwage variable with NAs
usa21_0<-usa21 %>%
na.omit() # get rid of all missing values
table(usa21_0$educ)
##
## 1 year of college 2 years of college 4 years of college
## 21314 14166 35767
## 5+ years of college grade 10 grade 11
## 23427 2043 3101
## grade 12 grade 5, 6, 7, or 8 grade 9
## 49339 1881 1144
## n/a or no schooling nursery school to grade 4
## 1897 369
table(usa21_0$educd)
##
## 1 or more years of college credit, no degree
## 21314
## 12th grade, no diploma
## 2460
## associate's degree, type not specified
## 14166
## bachelor's degree
## 35767
## doctoral degree
## 2951
## ged or alternative credential
## 4870
## grade 1
## 45
## grade 10
## 2043
## grade 11
## 3101
## grade 2
## 51
## grade 3
## 119
## grade 4
## 98
## grade 5
## 215
## grade 6
## 761
## grade 7
## 173
## grade 8
## 732
## grade 9
## 1144
## kindergarten
## 28
## master's degree
## 16474
## no schooling completed
## 1897
## nursery school, preschool
## 28
## professional degree beyond a bachelor's degree
## 4002
## regular high school diploma
## 31342
## some college, but less than 1 year
## 10667
#or
usa21_0%>%
count(educ)
## # A tibble: 11 × 2
## educ n
## <chr> <int>
## 1 1 year of college 21314
## 2 2 years of college 14166
## 3 4 years of college 35767
## 4 5+ years of college 23427
## 5 grade 10 2043
## 6 grade 11 3101
## 7 grade 12 49339
## 8 grade 5, 6, 7, or 8 1881
## 9 grade 9 1144
## 10 n/a or no schooling 1897
## 11 nursery school to grade 4 369
usa21_0<-usa21_0%>%
mutate(educCode=with(usa21_0, match(educd, unique(educd))))
or the same thing can be achieved by: usa21_0\(educCode = as.integer(factor(usa21_0\)educd, levels = unique(usa21_0$educd)))
# or we can group by educd and use the new dataset as an argument in our function nv_sum().
grouped_data<-usa21_0%>%
group_by(educd)
nv_sum(grouped_data, educCode)
## # A tibble: 24 × 8
## educd variable min mean median st.dev max n
## <chr> <chr> <int> <dbl> <dbl> <dbl> <int> <int>
## 1 1 or more years of college cr… educCode 2 2 2 0 2 21314
## 2 12th grade, no diploma educCode 13 13 13 0 13 2460
## 3 associate's degree, type not … educCode 8 8 8 0 8 14166
## 4 bachelor's degree educCode 11 11 11 0 11 35767
## 5 doctoral degree educCode 9 9 9 0 9 2951
## 6 ged or alternative credential educCode 7 7 7 0 7 4870
## 7 grade 1 educCode 24 24 24 0 24 45
## 8 grade 10 educCode 3 3 3 0 3 2043
## 9 grade 11 educCode 6 6 6 0 6 3101
## 10 grade 2 educCode 22 22 22 0 22 51
## # ℹ 14 more rows
usa21_1<- usa21_0%>%
mutate(edyearF =factor(educCode, c(2, 13, 8, 11, 9,7,24,3,6,22, 18, 19, 21, 16, 5, 17, 15, 20, 10, 14, 23, 12, 1, 4 ),
labels=c('13','11','14','16','20','12','1','10','11','2','3','4','5','6','7','8','9','0','18','0','0','18','12','13')),
edyears = as.numeric(as.character(edyearF)))
usa21_1
## # A tibble: 154,448 × 19
## year sample serial cbserial hhwt cluster strata gq pernum perwt sex
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
## 1 2021 2021 acs 41 2.02e12 460 2.02e12 200001 othe… 1 460 fema…
## 2 2021 2021 acs 51 2.02e12 420 2.02e12 10001 othe… 1 420 fema…
## 3 2021 2021 acs 61 2.02e12 590 2.02e12 190001 othe… 1 590 male
## 4 2021 2021 acs 71 2.02e12 30 2.02e12 140001 othe… 1 30 male
## 5 2021 2021 acs 81 2.02e12 570 2.02e12 30201 othe… 1 570 fema…
## 6 2021 2021 acs 141 2.02e12 650 2.02e12 240001 othe… 1 650 fema…
## 7 2021 2021 acs 201 2.02e12 520 2.02e12 40001 grou… 1 520 male
## 8 2021 2021 acs 261 2.02e12 650 2.02e12 110001 grou… 1 650 male
## 9 2021 2021 acs 291 2.02e12 690 2.02e12 130201 othe… 1 690 fema…
## 10 2021 2021 acs 321 2.02e12 500 2.02e12 160001 othe… 1 500 fema…
## # ℹ 154,438 more rows
## # ℹ 8 more variables: age <chr>, educ <chr>, educd <chr>, incwage <dbl>,
## # incearn <dbl>, educCode <int>, edyearF <fct>, edyears <dbl>
usa21_1%>%
group_by(educd) %>%
summarise(means = mean(edyears))
## # A tibble: 24 × 2
## educd means
## <chr> <dbl>
## 1 1 or more years of college credit, no degree 13
## 2 12th grade, no diploma 11
## 3 associate's degree, type not specified 14
## 4 bachelor's degree 16
## 5 doctoral degree 20
## 6 ged or alternative credential 12
## 7 grade 1 1
## 8 grade 10 10
## 9 grade 11 11
## 10 grade 2 2
## # ℹ 14 more rows
edyear_usa21_1<-usa21_1%>%
group_by(edyears)
nv_sum(edyear_usa21_1, incwage) # call the function nv_sum()
## # A tibble: 18 × 8
## edyears variable min mean median st.dev max n
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0 incwage 4 34666. 27000 42854. 634000 1953
## 2 1 incwage 670 27814. 25000 18635. 80000 45
## 3 2 incwage 1600 24127. 25000 14467. 70000 51
## 4 3 incwage 200 29781. 24000 27434. 156000 119
## 5 4 incwage 190 26284. 24400 21431. 159000 98
## 6 5 incwage 290 28754. 24000 23036. 160000 215
## 7 6 incwage 200 30654. 26300 21928. 192000 761
## 8 7 incwage 50 32335. 27000 30318. 260000 173
## 9 8 incwage 30 33461. 25000 44274. 554000 732
## 10 9 incwage 4 26928. 22000 35075. 568000 1144
## 11 10 incwage 20 18806. 9000 27670. 502000 2043
## 12 11 incwage 4 22773. 13000 32042. 627000 5561
## 13 12 incwage 4 36306. 30000 38667. 682000 36212
## 14 13 incwage 10 40204. 30000 45806. 770000 31981
## 15 14 incwage 20 48132. 40000 42930. 634000 14166
## 16 16 incwage 4 75868. 58000 80267. 787000 35767
## 17 18 incwage 30 105090. 75000 110291. 770000 20476
## 18 20 incwage 20 116607. 90000 108714. 682000 2951
gredyear_usa21_1<-usa21_1%>%
group_by(edyears)%>%
filter(age>=25 & age<=60) %>%
summarise(mean_wage=mean(incwage) )
usa21_1%>%
mutate(college_grads =ifelse(edyears>=16,1,0))%>%
group_by(college_grads)%>%
filter(age>=25 & age<=60)%>%
summarise(mean_wage=mean(incwage) )
## # A tibble: 2 × 2
## college_grads mean_wage
## <dbl> <dbl>
## 1 0 44707.
## 2 1 91904.
usa21_1%>%
filter(age>=25 & age<=60)%>%
ggplot(aes(edyears, incwage))+
geom_point()+
labs(
title = "Scatter Plot of Education vs. Earnings",
subtitle="2021 American Community Survey",
x="Years of education",y="wage($)"
)
usa21_1%>%
filter(age>=25 & age<=60)%>%
ggplot(aes(edyears, incwage))+
geom_point()+
geom_smooth(method=lm, se =FALSE)+
labs(
title = "Scatter Plot of Education vs. Earnings",
subtitle="2021 American Community Survey",
x="Years of education",y="wage($)"
)
## `geom_smooth()` using formula = 'y ~ x'