Important packages/libraries for this lab:

  1. tidyverse
  2. foreign: used when reading data that is not in .CSV format
  3. readstata13: used when reading data in Stata .dta format (Stata version 13 and above)
library(tidyverse) 
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Install any missing package first, then attach the ones used below with
# library(). library() stops with an error when a package cannot be loaded,
# whereas require() only returns FALSE and lets the script carry on.
# tufte is only checked/installed here; it is not attached.
if (!requireNamespace("tufte", quietly = TRUE)) {
  install.packages("tufte")
}
#library(tufte)

if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
# foreign helps when importing data that are not in .csv format
# (Stata, SAS, SPSS)
library(foreign)
#library(haven)

if (!requireNamespace("readstata13", quietly = TRUE)) {
  install.packages("readstata13")
}
library(readstata13)

# Clear all objects from the global environment. This does not detach
# packages or reset options; restarting the R session is the more reliable
# way to get a clean start.
rm(list = ls())

I. Loading the data set in R.

We use the function “read.dta13” from the library “readstata13”, which reads datasets saved in the .dta format of Stata version 13 and above.

# Read the two .dta files into R with read.dta13() from readstata13.
# The paths are relative, so both files must sit in the working directory.
Cps20 <- read.dta13(file = "CPS_2020_monthly.dta")
Cps21 <- read.dta13(file = "CPS_2021_23_monthly.dta")

III. Looking at the data: Describe, Browse, Summarize, Tabulate

7.

# Compact overview of the data structure (columns and their types)
glimpse(Cps20)

# Summary statistics for every variable in the dataset
summary(Cps20)

# Frequency distribution of a variable of interest:
# first as proportions, then as raw counts
proportions(table(Cps20$month))
table(Cps20$month)

# First values of one specific variable from the dataset
head(Cps20$month, n = 6L)

# Missing data in a variable
is.na(Cps20$unemployed)        # TRUE/FALSE for every observation
sum(is.na(Cps20$unemployed))   # how many values are missing
which(is.na(Cps20$unemployed)) # positions of the missing values

8. Counting the number of occurrences of each value of a variable

# tabulate month
Cps20 %>%
  count(month) %>% # one row per month with its frequency n
  # share of each month in percent (proportion rounded to 4 digits,
  # i.e. a percentage with 2 decimals)
  mutate(Percent = round(n / sum(n), 4) * 100) %>%
  arrange(desc(Percent)) %>% # arranged in descending order
  # Cumulative percent (kept under the name Percentile). It is accumulated
  # from the raw counts rather than from the rounded Percent column, so
  # rounding errors do not add up and the last row is exactly 100.
  mutate(Percentile = round(cumsum(n) / sum(n), 4) * 100)
##        month     n Percent Percentile
## 1   february 80585    9.33       9.33
## 2    january 79880    9.25      18.58
## 3    october 77294    8.95      27.53
## 4   november 76243    8.83      36.36
## 5  september 75326    8.72      45.09
## 6   december 73545    8.52      53.60
## 7      march 71215    8.25      61.85
## 8      april 68734    7.96      69.81
## 9     august 67169    7.78      77.59
## 10       may 66201    7.67      85.26
## 11      july 64312    7.45      92.70
## 12      june 63003    7.30     100.00

IV. Generating new variables; use mutate to add a new variable to the dataset

10.

# Check the unemployed dummy against each employment-status category.
# mean() is called without na.rm, so a status with missing values of
# unemployed shows NA instead of a number.
Cps20 %>%
  group_by(empstat) %>%
  summarise(
    mean_unemployed = mean(unemployed), # mean of the dummy within the status
    n = n()                             # number of observations in the status
  )
## # A tibble: 8 × 3
##   empstat                        mean_unemployed      n
##   <fct>                                    <dbl>  <int>
## 1 armed forces                                NA   4296
## 2 at work                                      0 528302
## 3 has job, not at work last week               0  23995
## 4 unemployed, experienced worker               1  41237
## 5 unemployed, new worker                       1   1892
## 6 nilf, unable to work                        NA  45829
## 7 nilf, other                                 NA 143806
## 8 nilf, retired                               NA  74150

11. create a dummy variable female

# female is 1 when sex is "female" and 0 otherwise
# (a missing sex stays missing)
Cps20 <- Cps20 %>%
  mutate(female = as.numeric(sex == "female"))

12. Compute the mean of the female variable

# Frequency table of female with each value's share in percent
Cps20 %>%
  count(female) %>%
  mutate(percent = 100 * round(n / sum(n), digits = 4))
##   female      n percent
## 1      0 421602   48.82
## 2      1 441905   51.18
# Descriptive statistics of female: number of observations, mean,
# standard deviation, minimum and maximum
Cps20 %>%
  summarise(
    Obs = n(),
    mean_female = round(mean(female), digits = 3),
    Std.dev. = round(sd(female), digits = 3),
    min = min(female),
    max = max(female)
  )
##      Obs mean_female Std.dev. min max
## 1 863507       0.512      0.5   0   1
# optional: average unemployment rate by sex in each month
Cps20 %>%
  group_by(month, sex) %>%
  summarise(
    # mean of the unemployed dummy, ignoring missing values
    aver_unemployed = round(mean(unemployed, na.rm = TRUE), 3),
    # return an ungrouped result: no regrouping message and no trailing
    # ungroup() needed
    .groups = "drop"
  ) %>%
  # one row per month, one column per value of sex
  pivot_wider(
    names_from = sex,
    values_from = aver_unemployed
  )
## # A tibble: 12 × 3
##    month      male female
##    <fct>     <dbl>  <dbl>
##  1 january   0.042  0.035
##  2 february  0.042  0.032
##  3 march     0.046  0.04 
##  4 april     0.124  0.146
##  5 may       0.109  0.129
##  6 june      0.096  0.108
##  7 july      0.087  0.101
##  8 august    0.075  0.08 
##  9 september 0.071  0.071
## 10 october   0.063  0.059
## 11 november  0.062  0.056
## 12 december  0.062  0.057

13. Generate a dummy variable ‘college’ for having at least a 4-year college degree.

Create a dummy variable “college” that is equal to 1 if the person’s education (educ) is a bachelor’s degree, a master’s degree, or a doctorate degree, and 0 otherwise. We use the case_when function because we are handling more than one condition (the equivalent of using more than one if-statement).

# college is 1 for a bachelor's, master's or doctorate degree and 0 for
# everything else (including a missing educ).
# Columns are referenced by bare name inside mutate(): writing Cps20$educ
# would bypass the piped data and break as soon as the data is filtered or
# grouped earlier in the pipe.
Cps20 <- Cps20 %>%
  mutate(
    college = case_when(
      educ %in% c(
        "bachelor's degree",
        "master's degree",
        "doctorate degree"
      ) ~ 1,
      TRUE ~ 0
    )
  )

Visualizing unemployment across years (2020-2023)

# Monthly unemployment rate, one line per year.
# The data stays in long format (one row per year-month) and year is mapped
# to colour, so the plot works for whichever years are present: no
# hard-coded year columns to rename and no NA padding for months that a
# year does not have.
Cps20_23 %>%
  group_by(year, month) %>%
  summarise(
    unemp_yrs = mean(unemployed, na.rm = TRUE), # rate within the year-month
    .groups = "drop"
  ) %>%
  ggplot(aes(x = month, y = unemp_yrs, group = year, color = factor(year))) +
  geom_line() +
  labs(
    title = "Unemployment rates",
    y = "Unemployment_rates", x = "Months",
    color = "Years"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.1, vjust = 0))

#this is the end! thank you.