library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Install any dependency that is missing, then attach it.
# `requireNamespace()` only checks availability; `library()` does the attach
# and stops with an error if the package still cannot be loaded. The previous
# `if (!require(pkg)) install.packages(pkg)` pattern left a freshly installed
# package unattached and failed silently (require() just returns FALSE).
#   tufte       - was attached through require(); kept attached for parity
#   foreign     - importers for data that are not in csv format
#                 (Stata, SAS, SPSS)
#   readstata13 - provides read.dta13(), used below
# (haven is an alternative importer; it is not used in this script.)
invisible(lapply(
  c("tufte", "foreign", "readstata13"),
  function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }
))

# NOTE: the former `rm(list = ls())` was removed on purpose. It deletes the
# caller's objects when the script is sourced and does not reset attached
# packages or options; restart the R session to get a clean environment.
We use the function “read.dta13” from the “readstata13” package, which reads .dta datasets saved by Stata version 13 and above.
# Read the two monthly CPS extracts with read.dta13().
Cps20 <- read.dta13("CPS_2020_monthly.dta")
Cps21 <- read.dta13("CPS_2021_23_monthly.dta")

# Compact overview of the 2020 data: one line per column.
glimpse(Cps20)

# Summary statistics for every variable in the dataset.
summary(Cps20)

# Frequency distribution of `month`: proportions first, then raw counts.
prop.table(table(Cps20[["month"]]))
table(Cps20[["month"]])

# First few values of a single variable.
head(Cps20[["month"]])

# Missing data in `unemployed`:
is.na(Cps20[["unemployed"]])        # TRUE/FALSE flag for every row
sum(is.na(Cps20[["unemployed"]]))   # number of missing values
which(is.na(Cps20[["unemployed"]])) # row positions of the missing values
# Tabulate `month`: number of rows, percent share and cumulative percent,
# ordered from the most to the least frequent month.
Cps20 %>%
  count(month) %>%
  # share of all rows, rounded to 4 decimals and expressed in percent
  mutate(Percent = round(n / sum(n), 4) * 100) %>%
  arrange(desc(Percent)) %>%
  # Cumulative percent is computed from the raw counts, not by summing the
  # already-rounded `Percent` column; summing rounded values accumulates
  # rounding error and previously ended at 100.01 instead of 100.
  mutate(Percentile = round(cumsum(n) / sum(n), 4) * 100)
## month n Percent Percentile
## 1 february 80585 9.33 9.33
## 2 january 79880 9.25 18.58
## 3 october 77294 8.95 27.53
## 4 november 76243 8.83 36.36
## 5 september 75326 8.72 45.09
## 6 december 73545 8.52 53.60
## 7 march 71215 8.25 61.85
## 8 april 68734 7.96 69.81
## 9 august 67169 7.78 77.59
## 10 may 66201 7.67 85.26
## 11 july 64312 7.45 92.70
## 12 june 63003 7.30 100.00
# Check the `unemployed` dummy against `empstat`: mean of the dummy and number
# of rows within each employment-status category. `na.rm` is not set, so a
# category whose dummy contains missing values reports NA.
Cps20 %>%
  group_by(empstat) %>%
  summarise(
    mean_unemployed = mean(unemployed),
    n = n()
  )
## # A tibble: 8 × 3
## empstat mean_unemployed n
## <fct> <dbl> <int>
## 1 armed forces NA 4296
## 2 at work 0 528302
## 3 has job, not at work last week 0 23995
## 4 unemployed, experienced worker 1 41237
## 5 unemployed, new worker 1 1892
## 6 nilf, unable to work NA 45829
## 7 nilf, other NA 143806
## 8 nilf, retired NA 74150
# Dummy for women: 1 when `sex` equals "female", 0 otherwise
# (a missing `sex` stays missing).
Cps20 <- Cps20 %>%
  mutate(female = as.numeric(sex == "female"))

# Frequency table of the new dummy with percentage shares.
Cps20 %>%
  count(female) %>%
  mutate(percent = 100 * round(n / sum(n), 4))
## female n percent
## 1 0 421602 48.82
## 2 1 441905 51.18
# Descriptive statistics of the `female` dummy over the whole sample:
# row count, mean, standard deviation (both rounded to 3 decimals), min, max.
Cps20 %>%
  summarise(
    Obs = n(),
    mean_female = round(mean(female), 3),
    Std.dev. = round(sd(female), 3),
    min = min(female),
    max = max(female)
  )
## Obs mean_female Std.dev. min max
## 1 863507 0.512 0.5 0 1
# Optional: average unemployment rate by sex in each month, reshaped so that
# every month is one row and every sex is one column.
Cps20 %>%
  group_by(month, sex) %>%
  summarise(
    aver_unemployed = round(mean(unemployed, na.rm = TRUE), 3)
  ) %>%
  pivot_wider(
    names_from = sex,
    values_from = aver_unemployed
  ) %>%
  ungroup() # summarise() leaves the result grouped by month
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## month male female
## <fct> <dbl> <dbl>
## 1 january 0.042 0.035
## 2 february 0.042 0.032
## 3 march 0.046 0.04
## 4 april 0.124 0.146
## 5 may 0.109 0.129
## 6 june 0.096 0.108
## 7 july 0.087 0.101
## 8 august 0.075 0.08
## 9 september 0.071 0.071
## 10 october 0.063 0.059
## 11 november 0.062 0.056
## 12 december 0.062 0.057
Create a dummy variable “college”, which is equal to 1 if a person's education (educ) is a bachelor’s degree, a master’s degree, or a doctorate degree, and 0 otherwise. We use the case_when function because it handles several conditions at once (instead of chaining more than one if-statement).
# Dummy for a college degree: 1 when `educ` is a bachelor's, master's or
# doctorate degree, 0 for every other value (including a missing `educ`).
# `educ` is referenced through data masking rather than `Cps20$educ`, so the
# expression always uses the rows that mutate() is actually working on, and
# the three equality tests are collapsed into a single `%in%` check.
Cps20 <- Cps20 %>%
  mutate(
    college = case_when(
      educ %in% c(
        "bachelor's degree",
        "master's degree",
        "doctorate degree"
      ) ~ 1,
      TRUE ~ 0
    )
  )
In Stata, this takes only one line of code: tab month, sum(unemployed)
# Cross-tabulation of month (rows) by the unemployed dummy (columns: 0 and 1);
# the Stata equivalent is `tab month unemployed`.
table(Cps20[["month"]], Cps20[["unemployed"]])
##
## 0 1
## january 53695 2152
## february 54488 2103
## march 47437 2137
## april 39895 6217
## may 39600 5309
## june 38973 4414
## july 40456 4169
## august 42957 3598
## september 48097 3674
## october 50043 3284
## november 49226 3083
## december 47430 2989
# The same month-by-status counts built with dplyr/tidyr: drop rows where
# either variable is missing, count each (month, unemployed) pair, then spread
# the two status values into their own columns.
unemployment <- Cps20 %>%
  filter(!is.na(month), !is.na(unemployed)) %>%
  count(month, unemployed) %>%
  pivot_wider(
    names_from = unemployed,
    values_from = n
  ) %>%
  rename(
    employment_0 = "0",
    unemployment_1 = "1"
  )

unemployment
## # A tibble: 12 × 3
## month employment_0 unemployment_1
## <fct> <int> <int>
## 1 january 53695 2152
## 2 february 54488 2103
## 3 march 47437 2137
## 4 april 39895 6217
## 5 may 39600 5309
## 6 june 38973 4414
## 7 july 40456 4169
## 8 august 42957 3598
## 9 september 48097 3674
## 10 october 50043 3284
## 11 november 49226 3083
## 12 december 47430 2989
# Unemployment rate per month with the number of rows, standard deviation and
# range; missing values of `unemployed` are ignored in every statistic except
# the row count. To look at a single month, add a step such as
# `filter(month == "january")` before `summarise()`.
Cps20 %>%
  group_by(month) %>%
  summarise(
    Obs = n(),
    mean_unemployed = round(mean(unemployed, na.rm = TRUE), 3),
    Std.dev. = round(sd(unemployed, na.rm = TRUE), 3),
    min = min(unemployed, na.rm = TRUE),
    max = max(unemployed, na.rm = TRUE)
  )
## # A tibble: 12 × 6
## month Obs mean_unemployed Std.dev. min max
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 january 79880 0.039 0.192 0 1
## 2 february 80585 0.037 0.189 0 1
## 3 march 71215 0.043 0.203 0 1
## 4 april 68734 0.135 0.342 0 1
## 5 may 66201 0.118 0.323 0 1
## 6 june 63003 0.102 0.302 0 1
## 7 july 64312 0.093 0.291 0 1
## 8 august 67169 0.077 0.267 0 1
## 9 september 75326 0.071 0.257 0 1
## 10 october 77294 0.062 0.24 0 1
## 11 november 76243 0.059 0.236 0 1
## 12 december 73545 0.059 0.236 0 1
# To compute the mean for a specific month, use the filter() function (for example, filter(month == "january")).
# Average unemployment rate by month for men (female == 0) and women
# (female == 1), one row per month and one column per group.
# `.groups = "drop"` returns an ungrouped result: previously the table stayed
# grouped by month (and summarise() printed a grouping message), so any later
# verb applied to `sum_female` would silently have operated per month.
sum_female <- Cps20 %>%
  group_by(month, female) %>%
  summarise(
    aver_unemployed = mean(unemployed, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = female,
    values_from = aver_unemployed
  ) %>%
  rename(
    unemp_male = "0",
    unemp_female = "1"
  )
sum_female
## # A tibble: 12 × 3
## month unemp_male unemp_female
## <fct> <dbl> <dbl>
## 1 january 0.0420 0.0348
## 2 february 0.0415 0.0324
## 3 march 0.0462 0.0397
## 4 april 0.124 0.146
## 5 may 0.109 0.129
## 6 june 0.0964 0.108
## 7 july 0.0867 0.101
## 8 august 0.0746 0.0802
## 9 september 0.0705 0.0715
## 10 october 0.0635 0.0595
## 11 november 0.0617 0.0559
## 12 december 0.0617 0.0566
Plotting in R is optional; you can also plot this in Excel using sum_female.
# Line chart of the monthly unemployment rate, one line for men and one for
# women, drawn from the wide `sum_female` table.
# `group = 1` is required because `month` is a factor: without it each month
# would form its own group and no line would be drawn.
ggplot(sum_female) +
  geom_line(aes(month, unemp_male, group = 1, color = "male")) +
  geom_line(aes(month, unemp_female, group = 1, color = "female")) +
  labs(
    title = "Average unemployment rate accross gender",
    subtitle = "in 2020",
    x = "Month",
    # The y axis shows the unemployment rate; it was mislabelled "gender".
    y = "Unemployment rate",
    color = ""
  )
# Average unemployment rate for every month and age category.
# NOTE(review): `Cps20_1` and its `agecat` column are not created in the
# visible part of this script - confirm where they are defined.
Cps20_1 %>%
  group_by(month, agecat) %>%
  summarise(
    aver_unemp = mean(unemployed, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
## # A tibble: 72 × 3
## # Groups: month [12]
## month agecat aver_unemp
## <fct> <chr> <dbl>
## 1 january 0 0.129
## 2 january 1 0.0644
## 3 january 2 0.0346
## 4 january 3 0.0288
## 5 january 4 0.0279
## 6 january 5 0.0282
## 7 february 0 0.111
## 8 february 1 0.0594
## 9 february 2 0.0340
## 10 february 3 0.0295
## # ℹ 62 more rows
# Monthly unemployment rate by age category, drawn as one line per category.
# The rates are computed per (month, agecat), spread into one column per
# category (cat0 ... cat5) and each column is plotted as its own layer.
# NOTE(review): `Cps20_1` and `agecat` are not created in the visible part of
# this script, and the mapping of codes 0-5 to the age ranges in the legend is
# taken from the labels below - confirm both against the code that builds
# `agecat`.
age_graph <- Cps20_1 %>%
  group_by(month, agecat) %>%
  summarise(aver_unemp = mean(unemployed, na.rm = TRUE)) %>%
  pivot_wider(
    names_from = agecat,
    values_from = aver_unemp
  ) %>%
  rename(
    cat0 = "0", cat1 = "1", cat2 = "2",
    cat3 = "3", cat4 = "4", cat5 = "5"
  ) %>%
  ggplot() +
  geom_line(aes(month, cat0, group = 1, color = "age<20")) +
  geom_line(aes(month, cat1, group = 1, color = "between age 20 &30")) +
  geom_line(aes(month, cat2, group = 1, color = "between age 30 &40")) +
  geom_line(aes(month, cat3, group = 1, color = "between age 40 &50")) +
  # Fixed: this layer used to plot `cat5`, so category 4 was never drawn and
  # the 50-60 line was an exact copy of the >60 line.
  geom_line(aes(month, cat4, group = 1, color = "between age 50 &60")) +
  geom_line(aes(month, cat5, group = 1, color = "age>60")) +
  labs(
    title = "Unemployment rates accross age category: 2020",
    x = "Months", y = "Unemploymnt_rates",
    color = "Age Groups"
  ) +
  # tilt the month labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 0.1, vjust = 0))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
age_graph
# Stack the 2020 extract on top of the 2021-23 extract; columns present in
# only one of the two inputs are filled with NA by bind_rows().
Cps20_23 <- bind_rows(Cps20, Cps21)

# Unemployment rate per year with row count, standard deviation and range;
# missing values of `unemployed` are ignored in every statistic except the
# row count.
Cps20_23 %>%
  group_by(year) %>%
  summarise(
    Obs = n(),
    unemp_yrs = round(mean(unemployed, na.rm = TRUE), 3),
    Std.dev. = round(sd(unemployed, na.rm = TRUE), 3),
    min = min(unemployed, na.rm = TRUE),
    max = max(unemployed, na.rm = TRUE)
  )
## # A tibble: 4 × 6
## year Obs unemp_yrs Std.dev. min max
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2020 863507 0.072 0.259 0 1
## 2 2021 432672 0.05 0.218 0 1
## 3 2022 411969 0.034 0.182 0 1
## 4 2023 233241 0.035 0.183 0 1
# Monthly unemployment rate, one line per year (2020-2023).
# The rates are computed per (year, month), spread into one column per year
# and each year column is drawn as its own layer. Months with no value for a
# given year are NA after the reshape and are dropped by geom_line().
Cps20_23 %>%
  group_by(year, month) %>%
  summarise(unemp_yrs = mean(unemployed, na.rm = TRUE)) %>%
  pivot_wider(
    names_from = year,
    values_from = unemp_yrs
  ) %>%
  rename(
    yr_2020 = "2020",
    yr_2021 = "2021",
    yr_2022 = "2022",
    yr_2023 = "2023"
  ) %>%
  ggplot() +
  geom_line(aes(month, yr_2020, group = 1, color = "2020")) +
  geom_line(aes(month, yr_2021, group = 1, color = "2021")) +
  geom_line(aes(month, yr_2022, group = 1, color = "2022")) +
  geom_line(aes(month, yr_2023, group = 1, color = "2023")) +
  labs(
    title = "Unemployment rates",
    x = "Months",
    y = "Unemployment_rates",
    color = "Years"
  ) +
  # tilt the month labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 0.1, vjust = 0))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).
#this is the end! thank you.