Using Workflow Sets to Screen and Compare Model-Recipe Combinations for Bank Loan Classification

towards-data-science

This post was originally published by Murray Gillin at Towards Data Science

# Split data for testing and training ----
# Stratify on the outcome so both partitions keep the Loan_Status class balance.
# NOTE(review): `train` is assumed to be loaded earlier in the analysis — confirm upstream.
set.seed(101)
loan_split <- initial_split(train, prop = 0.8, strata = Loan_Status)
# Initialise seven model specifications for screening ----
# Each spec marks its main hyperparameters with tune() for later grid search.
# Engines required: klaR, glmnet, rpart, ranger, kknn, kernlab, xgboost.
# (The original paste had the seven assignments fused onto shared lines,
# which is a syntax error in R; they are separated here.)

# Naive Bayes
nb_loan <-
  naive_Bayes(smoothness = tune(), Laplace = tune()) %>%
  set_engine("klaR") %>%
  set_mode("classification")

# Penalised logistic regression (elastic net)
logistic_loan <-
  logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

# Decision tree
dt_loan <-
  decision_tree(cost_complexity = tune(), tree_depth = tune(), min_n = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

# Random forest
rf_loan <-
  rand_forest(mtry = tune(), trees = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")

# K-nearest neighbours
knn_loan <-
  nearest_neighbor(neighbors = tune(), weight_func = tune(), dist_power = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Radial-basis-function support vector machine
svm_loan <-
  svm_rbf(cost = tune(), rbf_sigma = tune(), margin = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("classification")

# Gradient-boosted trees
xgboost_loan <-
  boost_tree(mtry = tune(), trees = tune(), min_n = tune(),
             tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
# Initialise recipes 1 and 2 ----
# (The original paste fused recipe_2's assignment onto the end of recipe_1's
# last step, a syntax error; the two recipes are separated here.)

# Recipe 1: drop the ID column, recode Credit_History, scale numeric
# predictors, impute missing values with bagged trees, dummy-encode nominals.
recipe_1 <-
  recipe(Loan_Status ~ ., data = training(loan_split)) %>%
  step_rm(Loan_ID) %>%
  # Encode Credit_History as 1 / -1, with NA mapped to 0
  step_mutate(Credit_History = if_else(Credit_History == 1, 1, -1, 0)) %>%
  step_scale(all_numeric_predictors(), -Credit_History) %>%
  step_impute_bag(Gender,
                  Married,
                  Dependents,
                  Self_Employed,
                  Loan_Amount,
                  Loan_Amount_Term) %>%
  step_dummy(all_nominal_predictors())

# Recipe 2: as Recipe 1, plus SMOTE upsampling of the minority class
# (step_smote comes from the themis package).
recipe_2 <-
  recipe(Loan_Status ~ ., data = training(loan_split)) %>%
  step_rm(Loan_ID) %>%
  step_mutate(Credit_History = if_else(Credit_History == 1, 1, -1, 0)) %>%
  step_scale(all_numeric_predictors(), -Credit_History) %>%
  step_impute_bag(Gender,
                  Married,
                  Dependents,
                  Self_Employed,
                  Loan_Amount,
                  Loan_Amount_Term) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(Loan_Status)

# Recipe 3: mean/mode imputation instead of bagged trees, plus a
# zero-variance filter; no class balancing.
recipe_3 <-
recipe(Loan_Status ~., data = training(loan_split)) %>%
step_rm(Loan_ID) %>%
# Encode Credit_History as 1 / -1, with NA mapped to 0
step_mutate(Credit_History = if_else(Credit_History == 1, 1, -1,0)) %>%
step_scale(all_numeric_predictors(), -Credit_History) %>%
step_impute_mean(all_numeric_predictors()) %>%
step_impute_mode(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())

# Recipe 4: as Recipe 3 (mean/mode imputation + zero-variance filter),
# plus SMOTE upsampling of the minority class (themis package).
recipe_4 <- 
recipe(Loan_Status ~., data = training(loan_split)) %>% 
step_rm(Loan_ID) %>%
# Encode Credit_History as 1 / -1, with NA mapped to 0
step_mutate(Credit_History = if_else(Credit_History == 1, 1, -1,0)) %>% 
step_scale(all_numeric_predictors(), -Credit_History) %>%  
step_impute_mean(all_numeric_predictors()) %>%
step_impute_mode(all_nominal_predictors()) %>% 
step_dummy(all_nominal_predictors()) %>% 
step_zv(all_predictors()) %>% 
step_smote(Loan_Status)
# Prep and bake training and test datasets ----
# Only recipe_1 is baked here (for the correlation plot below); the other
# recipes are applied automatically inside workflow_map.
loan_train <- recipe_1 %>% prep() %>% bake(new_data = NULL)
loan_test <- recipe_1 %>% prep() %>% bake(testing(loan_split))
# Generate correlation visualisation over the combined baked data ----
# (corrr package: correlate/rearrange/shave/rplot.)
loan_train %>%
  bind_rows(loan_test) %>%
  # Convert the outcome to numeric so it can enter the correlation matrix
  mutate(Loan_Status = if_else(Loan_Status == "Y", 1, 0)) %>%
  correlate() %>%
  rearrange() %>%
  shave() %>%
  # TRUE spelled out: T is a reassignable variable, not a reserved word
  rplot(print_cor = TRUE, .order = "alphabet") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_color_viridis_c() +
  labs(title = "Correlation Plot for Trained Loan Dataset")
Correlation Plot for Trained Loan Dataset (Image by Author)
# Build the workflow set and tune every recipe x model combination ----
# (The original paste fused assignments onto shared lines; separated here.)

# Named list of recipes
recipe_list <-
  list(Recipe1 = recipe_1, Recipe2 = recipe_2,
       Recipe3 = recipe_3, Recipe4 = recipe_4)

# Named list of model specifications
model_list <-
  list(Random_Forest = rf_loan, SVM = svm_loan, Naive_Bayes = nb_loan,
       Decision_Tree = dt_loan, Boosted_Trees = xgboost_loan,
       KNN = knn_loan, Logistic_Regression = logistic_loan)

# cross = TRUE crosses every recipe with every model (4 x 7 = 28 workflows)
model_set <- workflow_set(preproc = recipe_list, models = model_list,
                          cross = TRUE)

# Bootstrap resamples of the training partition, stratified on the outcome
set.seed(2)
train_resamples <- bootstraps(training(loan_split), strata = Loan_Status)

doParallel::registerDoParallel(cores = 12)

# Tune each workflow over the resamples (default grid and metrics)
all_workflows <-
  model_set %>%
  workflow_map(resamples = train_resamples,
               verbose = TRUE)
Output from workflow_map (Image by Author)
# Visualise performance comparison of workflows ----
collect_metrics(all_workflows) %>%
  # wflow_id is "<Recipe>_<Model_Type>"; keep the original id column too.
  # FALSE spelled out: F is a reassignable variable, not a reserved word.
  separate(wflow_id, into = c("Recipe", "Model_Type"), sep = "_",
           remove = FALSE, extra = "merge") %>%
  filter(.metric == "accuracy") %>%
  group_by(wflow_id) %>%
  # Keep the best mean accuracy per workflow
  filter(mean == max(mean)) %>%
  group_by(model) %>%
  select(-.config) %>%
  distinct() %>%
  ungroup() %>%
  mutate(Workflow_Rank = row_number(-mean),
         .metric = str_to_upper(.metric)) %>%
  ggplot(aes(x = Workflow_Rank, y = mean, shape = Recipe,
             color = Model_Type)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err)) +
  theme_minimal() +
  scale_colour_viridis_d() +
  labs(title = "Performance Comparison of Workflow Sets",
       x = "Workflow Rank", y = "Accuracy",
       color = "Model Types", shape = "Recipes")
Performance Comparison of All Workflows (Image by Author)
# Bayesian comparison of workflow accuracy (tidyposterior::perf_mod) ----
# Fits a hierarchical model over the resampled accuracy estimates;
# iter = 5000 MCMC iterations.
doParallel::registerDoParallel(cores = 12)
set.seed(246)
acc_model_eval <- perf_mod(all_workflows, metric = "accuracy", iter = 5000)
Sample output from perf_mod (Image by Author)
# Extract results from posterior analysis and visualise distributions ----
# (In the original paste the pipeline's first expression was fused onto the
# comment line, commenting it out and breaking the chain; fixed here.)
acc_model_eval %>%
  tidy() %>%
  # Preserve workflow order for faceting
  mutate(model = fct_inorder(model)) %>%
  ggplot(aes(x = posterior)) +
  geom_histogram(bins = 50) +
  theme_minimal() +
  facet_wrap(~model, nrow = 7, ncol = 6) +
  labs(title = "Comparison of Posterior Distributions of Model Recipe Combinations",
       x = expression(paste("Posterior for Mean Accuracy")),
       y = "")
Posterior Distributions of Mean Accuracy for all Workflows (Image by Author)
# Compare two models: difference in posterior means ----
mod_compare <- contrast_models(acc_model_eval,
                               list_1 = "Recipe1_Decision_Tree",
                               list_2 = "Recipe1_Boosted_Trees")

# Posterior of the mean difference in accuracy between the two workflows
a1 <- mod_compare %>%
  as_tibble() %>%
  ggplot(aes(x = difference)) +
  geom_histogram(bins = 50, col = "white", fill = "#73D055FF") +
  geom_vline(xintercept = 0, lty = 2) +
  theme_minimal() +
  scale_fill_viridis_b() +
  # Title corrected: the contrast above is Recipe1 vs Recipe1, not Recipe3
  labs(x = "Posterior for Mean Difference in Accuracy", y = "",
       title = "Posterior Mean Difference Recipe1_Decision_Tree & Recipe1_Boosted_Trees")

# Individual posterior distributions of the two contrasted workflows
a2 <- acc_model_eval %>%
  tidy() %>%
  mutate(model = fct_inorder(model)) %>%
  filter(model %in% c("Recipe1_Boosted_Trees", "Recipe1_Decision_Tree")) %>%
  ggplot(aes(x = posterior)) +
  geom_histogram(bins = 50, col = "white", fill = "#73D055FF") +
  theme_minimal() +
  scale_colour_viridis_b() +
  facet_wrap(~model, nrow = 2, ncol = 1) +
  labs(title = "Comparison of Posterior Distributions of Model Recipe Combinations",
       x = expression(paste("Posterior for Mean Accuracy")),
       y = "")

# Stack the two panels (patchwork operator)
a2 / a1
Individual Mean Accuracy Posterior Distributions, and Their Difference (Image by Author)
mod_compare %>% summary()
# Console output:
#   mean        0.001371989  — difference in means between posterior distributions
#   probability 0.5842       — proportion of the posterior of the mean difference > 0

summary(mod_compare, size = 0.02)
# Console output:
#   pract_equiv 0.9975
# Pluck and modify the underlying tibble from autoplot() ----
# type = "ROPE" gives practical-equivalence probabilities at the given
# effect size; we re-plot them ranked.
autoplot(acc_model_eval, type = "ROPE", size = 0.02) %>%
  pluck("data") %>%
  mutate(rank = row_number(-pract_equiv)) %>%
  arrange(rank) %>%
  # FALSE spelled out; keep the original model column alongside the split
  separate(model, into = c("Recipe", "Model_Type"), sep = "_",
           remove = FALSE, extra = "merge") %>%
  ggplot(aes(x = rank, y = pract_equiv, color = Model_Type, shape = Recipe)) +
  geom_point(size = 5) +
  theme_minimal() +
  scale_colour_viridis_d() +
  # "Equivalence" typo fixed in the y-axis label
  labs(y = "Practical Equivalence", x = "Workflow Rank",
       size = "Probability of Practical Equivalence", color = "Model Type",
       title = "Practical Equivalence of Workflow Sets",
       subtitle = "Calculated Using an Effect Size of 0.02")
Practical Equivalence of Workflows With Effects Size of 0.02 (Image by Author)
# Finalise the chosen workflow and evaluate it on the test split ----
# (The original paste fused three assignments onto shared lines; fixed here.)

# Pull the best-performing hyperparameter set from the workflow_map object.
# NOTE(review): pull_workflow_set_result()/pull_workflow() are deprecated in
# newer workflowsets releases in favour of extract_workflow_set_result()/
# extract_workflow(); kept as-is to match the installed version used here.
best_result <- all_workflows %>%
  pull_workflow_set_result("Recipe1_Decision_Tree") %>%
  select_best(metric = "accuracy")

# Finalise the workflow object with the best parameters
dt_wf <- all_workflows %>%
  pull_workflow("Recipe1_Decision_Tree") %>%
  finalize_workflow(best_result)

# Fit the workflow to the training data and predict on the test dataset
dt_res <-
  dt_wf %>%
  fit(training(loan_split)) %>%
  predict(new_data = testing(loan_split)) %>%
  bind_cols(loan_test) %>%
  # Reorder factor levels by frequency in each column (as in the original)
  mutate(.pred_class = fct_infreq(.pred_class),
         Loan_Status = fct_infreq(Loan_Status))

# Calculate accuracy of the prediction
accuracy(dt_res, truth = Loan_Status, estimate = .pred_class)
Model Accuracy Against Test Data (Image by Author)
Confusion Matrix for Predictions (Image by Author)
# Fit the final workflow and extract the underlying parsnip fit ----
# (The original paste fused two assignments onto shared lines; fixed here.)
dt_wf_fit <-
  dt_wf %>%
  fit(training(loan_split))

# NOTE(review): pull_workflow_fit() is deprecated in newer workflows releases
# in favour of extract_fit_parsnip(); kept as-is for compatibility.
dt_fit <-
  dt_wf_fit %>%
  pull_workflow_fit()

# Generate a decision tree diagram using the rpart.plot package
rpart.plot::rpart.plot(dt_fit$fit)
Decision Tree Diagram for Model Decision Making (Image by Author)
Spread the word

This post was originally published by Murray Gillin at Towards Data Science

Related posts