###-------------------------------###
### BRM Session 3 - Data cleaning ###
###-------------------------------###

# Dennis Abel and Lukas Birkenmaier


# Intro and setup ---------------------------------------------------------

# This session prepares the Oddjob Airways dataset for further analysis:
#   * reduce the data frame to the variables needed in the next sessions
#     (all other variables can be excluded),
#   * check that every variable is classified correctly as a numerical or a
#     character (factor) variable,
#   * assign the value labels from the codebook to the factor variables.
# Each step is shown in base R as well as in the tidyverse. We generally
# advise you to use the tidy approaches, but it is still helpful to know how
# base R works.

# Load required packages
library(tidyverse)

# Working directory
setwd("[insert directory here]")

# Load the CSV data with the read_csv() function
oddjob <- read_csv("oddjob.csv")


################################################################################
## Exercise 1: Screen through the dataset and codebook and familiarize
## yourself with the dataset. Let's think about the measurement levels.
## How are these variables measured?
################################################################################

# Get a feeling for the dataset
head(oddjob)      # first few observations, top of dataset
dim(oddjob)       # dimensions of the dataset (rows by columns)
colnames(oddjob)  # column (generally variable) names


# Data cleaning -----------------------------------------------------------

# As a first step, we perform all steps needed to "clean" the dataset and
# prepare it for further processing. We subset the dataset to those variables
# which we would like to use in the next sessions.
# Variables we keep, with their intended measurement level:
#   age             (numerical)
#   gender          (factor)
#   language        (factor)
#   country         (factor)
#   flight_class    (factor)
#   flight_purpose  (factor)
#   flight_type     (factor)
#   nflights        (numerical)
#   status          (factor)
#   nps             (numerical)
#   reputation      (numerical)
#   overall_sat     (numerical)
#   commitment      (numerical)
# Consult the codebook once more to understand what these variables measure.

# Subsetting is a good example of how several approaches often lead to the
# same result. Base R offers the subset() function, which we can use to
# reduce the dataset to the selected variables.
oddjob_sub <- subset(
  oddjob,
  select = c(
    age, gender, language, country,
    flight_class, flight_purpose, flight_type,
    nflights, status, nps, reputation, overall_sat, commitment
  )
)

# As we learned in the last session, dplyr also offers the select() function.
# The result is the same.
oddjob_sub <- oddjob |>
  select(
    age, gender, language, country,
    flight_class, flight_purpose, flight_type,
    nflights, status, nps, reputation, overall_sat, commitment
  )

# R classified the numerical variables correctly. The factor variables,
# however, are stored as numerical codes, so R assumes they are numerical as
# well. Let's transform these variables into factors and assign the factor
# labels.
# We first show once more how to approach this in base R and afterwards the
# tidy approach.
# In base R, we have to access each variable individually.
# Keep an untouched copy of the numerically coded data before converting.
# The base R section and the tidy section below convert the same seven
# columns, and factor() must not be applied twice to the same column: once a
# column holds labels such as "female", its values no longer match the numeric
# codes given in `levels`, so a second conversion would turn every value
# into NA. The tidy section therefore starts again from this copy.
oddjob_coded <- oddjob_sub

# Base R approach: one assignment per variable ----------------------------

# Gender: nominal two factors
oddjob_sub$gender <- factor(
  oddjob_sub$gender,
  levels = c(1, 2),
  labels = c("female", "male")
)

# Language: nominal three factors
oddjob_sub$language <- factor(
  oddjob_sub$language,
  levels = c(1, 2, 3),
  labels = c("German", "English", "French")
)

# Country: nominal five factors
oddjob_sub$country <- factor(
  oddjob_sub$country,
  levels = c(1, 2, 3, 4, 5),
  labels = c("Germany", "Switzerland", "Austria", "France", "USA")
)

# Flight class: ordinal three factors
oddjob_sub$flight_class <- factor(
  oddjob_sub$flight_class,
  ordered = TRUE,
  levels = c(1, 2, 3),
  labels = c("First", "Business", "Economy")
)

# Flight purpose: nominal two factors
oddjob_sub$flight_purpose <- factor(
  oddjob_sub$flight_purpose,
  levels = c(1, 2),
  labels = c("Business", "Leisure")
)

# Flight type: nominal two factors
oddjob_sub$flight_type <- factor(
  oddjob_sub$flight_type,
  levels = c(1, 2),
  labels = c("Domestic", "International")
)

# Traveler's status: ordinal three factors
oddjob_sub$status <- factor(
  oddjob_sub$status,
  ordered = TRUE,
  levels = c(1, 2, 3),
  labels = c("blue", "silver", "gold")
)

# Tidy approach: all conversions in one command ---------------------------

# The tidy approach reduces these single steps into one command - it looks
# much more complex but is more efficient and minimizes error risks.
# It starts from the coded copy (not from the already labelled `oddjob_sub`),
# so the numeric codes are still available for matching against `levels`.
oddjob_sub <- oddjob_coded |>
  mutate(
    gender = factor(
      gender,
      levels = c(1, 2),
      labels = c("female", "male")
    ),
    language = factor(
      language,
      levels = c(1, 2, 3),
      labels = c("German", "English", "French")
    ),
    country = factor(
      country,
      levels = c(1, 2, 3, 4, 5),
      labels = c("Germany", "Switzerland", "Austria", "France", "USA")
    ),
    flight_class = factor(
      flight_class,
      ordered = TRUE,
      levels = c(1, 2, 3),
      labels = c("First", "Business", "Economy")
    ),
    flight_purpose = factor(
      flight_purpose,
      levels = c(1, 2),
      labels = c("Business", "Leisure")
    ),
    flight_type = factor(
      flight_type,
      levels = c(1, 2),
      labels = c("Domestic", "International")
    ),
    status = factor(
      status,
      ordered = TRUE,
      levels = c(1, 2, 3),
      labels = c("blue", "silver", "gold")
    )
  )

# Now, our data frame looks clean and is easily readable.
# Looking things up in the codebook and assigning the labels was quite a bit
# of manual labour (with either approach), so we do not want to repeat it the
# next time we work with the dataset.
# We therefore store the data frame as an RDS object, which is R's custom
# binary format. That way, we can always load it in a new session.
oddjob_sub |>
  write_rds("oddjob.rds")