### Safe-D binomial logistic regression model
### The purpose is to predict the probability of a driver speeding X% of free-flow duration for given variables listed.

###------------------------------------------ 
####### 1. Install and load libraries ####### 
###------------------------------------------ 

install.packages("dplyr")
install.packages("GGally")
install.packages("gmodels")
install.packages("estimatr")
install.packages("sandwich")
install.packages("multiwayvcov")
install.packages("miceadds")

library("dplyr")
library("GGally")
library("gmodels")
library("sandwich")
library("miceadds")
library("multiwayvcov")
library("estimatr")

###----------------------------------------
####### 2. Set up working directory ####### 
###---------------------------------------- 

### Set working directory to the folder that contains the csv files
setwd("C:/SharePoint/TxDOT Travel Survey Program - Docum/Safe-D Data/Trip Summary Data")

###---------------------------------
####### 3. Load input files ####### 
###---------------------------------

### Create list of csv files names
filenames <- list.files(full.names = TRUE)

### Merge all csv files into a common dataframe
myMergedData <- do.call(rbind, lapply(filenames, read.csv, header = TRUE))

###-------------------------------------------
####### 4. Prepare tables for analysis ####### 
###-------------------------------------------

### 1. Convert NAs to 0s for trip summary variables
myMergedData$s_trip_dur[is.na(myMergedData$s_trip_dur)] <- 0
myMergedData$s_free_flow_dur[is.na(myMergedData$s_free_flow_dur)] <- 0
myMergedData$s_speeding_dur[is.na(myMergedData$s_speeding_dur)] <- 0
myMergedData$s_trip_dist[is.na(myMergedData$s_trip_dist)] <- 0
myMergedData$s_free_flow_dist[is.na(myMergedData$s_free_flow_dist)] <- 0
myMergedData$s_speeding_dist[is.na(myMergedData$s_speeding_dist)] <- 0
myMergedData$s_speeding_tt_dur_pct[is.na(myMergedData$s_speeding_tt_dur_pct)] <- 0
myMergedData$s_speeding_ff_dur_pct[is.na(myMergedData$s_speeding_ff_dur_pct)] <- 0
myMergedData$s_speeding_tt_dist_pct[is.na(myMergedData$s_speeding_tt_dist_pct)] <- 0
myMergedData$s_speeding_ff_dist_pct[is.na(myMergedData$s_speeding_ff_dist_pct)] <- 0


### 2. Add and code new model variables
myMergedData["unique_driver_id"] <- paste(myMergedData$study_area, myMergedData$sample, myMergedData$per_num, sep = "_")

myMergedData["speeding_10pct_ff_dur"] <- ifelse(myMergedData$s_speeding_ff_dur_pct >= .10, 1, 0)
myMergedData["speeding_20pct_ff_dur"] <- ifelse(myMergedData$s_speeding_ff_dur_pct >= .20, 1, 0)

myMergedData["trip_occupancy_type"] <- ifelse(myMergedData$trp_per_count == 1, 0,ifelse(myMergedData$trp_per_count == 2, 1, ifelse(myMergedData$trp_per_count > 2, 2, 99)))  ### 0 = 0 passengers, 1 = 1 passengers, and 2 = 2 or more persons
myMergedData["child_occupancy_type"] <- ifelse(myMergedData$child_count == 1, 1, ifelse(myMergedData$child_count > 1, 2, 0))  ### 0 = 0 children, 1 = 1 child, and 2 = 2 or more children

myMergedData["trash_flag"] <- ifelse(myMergedData$trp_per_count == 0, 1, 0)

myMergedData["ethnicity_code"] <- ifelse(myMergedData$drivers_ethnicity == 5, 0, # White    # REFERENT
                                         ifelse(myMergedData$drivers_ethnicity == 1, 1, # Black
                                                ifelse(myMergedData$drivers_ethnicity == 2, 2, # Hispanic
                                                       ifelse(myMergedData$drivers_ethnicity == 3 | myMergedData$drivers_ethnicity == 4 | myMergedData$drivers_ethnicity > 5, 3, 99)))) # Other

myMergedData["study_area_code"] <- ifelse(myMergedData$study_area == "abilene_", 1,
                                          ifelse(myMergedData$study_area == "bcs_", 2,
                                                 ifelse(myMergedData$study_area == "cc_", 3,
                                                        ifelse(myMergedData$study_area == "ep_", 4,
                                                               ifelse(myMergedData$study_area == "hous_", 5,
                                                                      ifelse(myMergedData$study_area == "midland_odessa_", 6,
                                                                             ifelse(myMergedData$study_area == "san_ang_", 7,
                                                                                    ifelse(myMergedData$study_area == "sd_", 0,    # REFERENT
                                                                                           ifelse(myMergedData$study_area == "ta_", 8,
                                                                                                  ifelse(myMergedData$study_area == "vic_", 9,
                                                                                                         ifelse(myMergedData$study_area == "wf_", 10, 99)))))))))))




### 3. Convert variable codes to set referent group(s) which defaults to the lowest value
myMergedData$drivers_sex[which(myMergedData$drivers_sex == 2)] <- 0                     # female was 2 now 0
myMergedData$drivers_employment[which(myMergedData$drivers_employment == 2)] <- 0        # unemployed (i.e. no) was 2 now 0

### 4. Convert categorical variables to factors.
myMergedData$drivers_sex <- factor(myMergedData$drivers_sex)                      # 0 = female, 1 = male (coding changed to set referent group)
myMergedData$drivers_employment <- factor(myMergedData$drivers_employment)         # 0 = unemployed, 1 = employed
myMergedData$drivers_ethnicity <- factor(myMergedData$drivers_ethnicity)              # 1 = Black/African American, 2 = Hispanic/Mexican American, 3 = Asian/Pacific Islander, 4 = Native American, 5 = White/Caucasian, 96 = Other, 98 = Don't Know, 99 = Refused

myMergedData$speeding_10pct_ff_dur <- factor(myMergedData$speeding_10pct_ff_dur)   # 0 = excluded, 1 = included
myMergedData$speeding_20pct_ff_dur <- factor(myMergedData$speeding_20pct_ff_dur)   # 0 = excluded, 1 = included
    
myMergedData$trip_occupancy_type <- factor(myMergedData$trip_occupancy_type)       # 0 = 0 persons, 1 = 1 persons, and 2 = 2 or more persons
myMergedData$child_occupancy_type <- factor(myMergedData$child_occupancy_type)      # 0 = 0 children, 1 = 1 child, and 2 = 2 or more children

myMergedData$driver_w_child <- factor(myMergedData$driver_w_child)
myMergedData$driver_w_adult <- factor(myMergedData$driver_w_adult)

myMergedData$ethnicity_code <- factor(myMergedData$ethnicity_code)
myMergedData$study_area_code <- factor(myMergedData$study_area_code)


### 5. Data subsets

### Older drivers 65+ years old data subset
older_driver_data <- subset(myMergedData, (older_driver == 1 & trash_flag == 0),
                            select = c(samnum_gpsvehidx_gpstrpidx, unique_driver_id, speeding_20pct_ff_dur, drivers_sex, drivers_age, 
                                       drivers_ethnicity, drivers_employment, study_area, trip_occupancy_type, child_occupancy_type, 
                                       driver_w_child, driver_w_adult, ethnicity_code, study_area_code, trash_flag))

### Younger drivers between 16 to 24 years old data subset
younger_driver_data <- subset(myMergedData, (younger_driver == 1 & drivers_age >= 16 & trash_flag == 0),
                              select = c(samnum_gpsvehidx_gpstrpidx, unique_driver_id, speeding_20pct_ff_dur, drivers_sex, drivers_age, 
                                         drivers_ethnicity, drivers_employment, study_area, trip_occupancy_type, child_occupancy_type, 
                                         driver_w_child, driver_w_adult, ethnicity_code, study_area_code, trash_flag))

### All drivers 16+ years old
all_drivers_data <- subset(myMergedData, (drivers_age >= 16 & trash_flag == 0),
                           select = c(samnum_gpsvehidx_gpstrpidx, unique_driver_id, speeding_20pct_ff_dur, drivers_sex, drivers_age, 
                                      drivers_ethnicity, drivers_employment, study_area, trip_occupancy_type, child_occupancy_type, 
                                      driver_w_child, driver_w_adult, ethnicity_code, study_area_code, trash_flag))



### Export dataset to CSV file
#write.csv(myMergedData, file = "myMergedData_071318.csv", na = "", row.names = FALSE)
#write.csv(older_driver_data, file = "older_driver_data_071318.csv", na = "", row.names = FALSE)
#write.csv(younger_driver_data, file = "younger_driver_data_071318.csv", na = "", row.names = FALSE)
#write.csv(all_drivers_data, file = "all_drivers_data_071318.csv", na = "", row.names = FALSE)

### 6. 2-Way cross tabulation
CrossTable(older_driver_data$driver_w_child, older_driver_data$speeding_20pct_ff_dur)


###----------------------------------------------------
####### 5. Perform binomial logistic regression ####### 
###----------------------------------------------------

###  Older Drivers
###---------------------------------  

# Cluster robust standard errors (https://www.rdocumentation.org/packages/miceadds/versions/2.12-24/topics/lm.cluster)
myclusterlogit1 <- glm.cluster(data = older_driver_data, formula = speeding_20pct_ff_dur ~ ethnicity_code + study_area_code + drivers_employment + drivers_age + drivers_sex + driver_w_adult, cluster = "unique_driver_id", family = "binomial")

# Odds ratios and 95% CI
exp(cbind(OR = coef(myclusterlogit), confint(myclusterlogit)))

###  Younger Drivers
###---------------------------------  

# Cluster robust standard errors 
myclusterlogit2 <- glm.cluster(data = younger_driver_data, formula = speeding_20pct_ff_dur ~ study_area_code + driver_w_adult + driver_w_child, cluster = "unique_driver_id", family = "binomial")

# Odds ratios and 95% CI
exp(cbind(OR = coef(myclusterlogit2), confint(myclusterlogit2)))


###  All Drivers with a child passenger
###---------------------------------  

# Cluster robust standard errors 
myclusterlogit3 <- glm.cluster(data = all_drivers_data, formula = speeding_20pct_ff_dur ~ driver_w_child + driver_w_adult + ethnicity_code + study_area_code + drivers_employment + drivers_age + drivers_sex + trip_occupancy_type, cluster = "unique_driver_id", family = "binomial")

# Odds ratios and 95% CI
exp(cbind(OR = coef(myclusterlogit3), confint(myclusterlogit3)))

