library(IRdisplay)
display_html(
'<script>
code_show=true;
function code_toggle() {
if (code_show){
$(\'div.input\').hide();
} else {
$(\'div.input\').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>'
)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
load("ROC_soa2.rda")
load("xgb_param_err.rda")
load("resultsSoA2.rda")
library(ggplot2)
The data-set analyzed is Long term Care Incidence dataset.
The variables used for the analysis are:
We eliminated all the records where the elimination period was unknown.
The goal of this analysis is to predict the probability of any claim (between nursing home, assisted living facility and home health care). However, even after considering all the three categories together, the dataset is still highly unbalanced, with only 1.71% observations representing claims in the whole dataset.
As the data are highly umbalanced, the use of standard performance measures, such as accuracy, is not considered appropriate. For example, even the naive classifier that classifies all the claims to be false, achieves accuracy >98%.
For this reason, we use the Receiver Operating Characteristic (ROC) curve. The ROC curve is created by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings.
To compare the different models, we compare the area under the curve (AUC). A value of the AUC closer to 1 indicates a better fit.
The function below calculates the AUC and will be called later.
# compute the true positive rate and false positive rate given the predicted classes
TPR_FPR <- function(predictions){
TruePositive <- sum(predictions & dataTest'$'Claim)
FalsePositive <- sum(predictions) - TruePositive
FalseNegatve <- sum(dataTest'$'Claim) - TruePositive
TrueNegative <- length(predictions) - TruePositive - FalseNegatve - FalsePositive
FPR <- FalsePositive / (FalsePositive + TrueNegative)
TPR <- TruePositive / (TruePositive + FalseNegatve)
c(FPR,TPR)
}
# create ROC curves given the predicted probabilities of claim from the model
ROC <- function(predictions){
totalPoints <- 200
numIntervals <- 100
quantilePoints <- quantile(predictions, probs = seq(0,1,by = 1 / numIntervals))
# define tresholds points
listPoints <- lapply(1:(length(quantilePoints)-1), function(i) {
seq(quantilePoints[i], quantilePoints[i+1], length.out = totalPoints / numIntervals)
})
tresholds <- rev(c(-1,0,Reduce("c", listPoints), 1))
ROC_points <- t(sapply(tresholds, function(treshold){
TPR_FPR(predictions > treshold)
}))
ROC_points
}
# compute the area under the curve given the ROC curve
AUC <- function(ROC_points){
currentArea <- 0
for(i in 2:nrow(ROC_points)){
width <- ROC_points[i,1] - ROC_points[i-1,1]
height1 <- width * ROC_points[i-1,2]
height2 <- width * (ROC_points[i,2] - ROC_points[i-1,2]) / 2
currentArea <- currentArea + height1 + height2
}
currentArea
}
To evaluate the performance of the models, the data-set is split into a training set and a test set and the AUC is computed on the test set.
# Load the data
load("data_LTIC.rda")
# Select Training Data (70% of data)
set.seed(1)
Train <- sample(nrow(data), floor(0.7 * nrow(data)), replace = FALSE)
dataTrain <- data[Train, ]
dataTest <- data[-Train, ]
dataTrain_cov <- dataTrain[,c("IssueAgeBucket","Underwriting_Type","Prem_Class","Gender",
"Marital_Status", "Region","PolicyYear","AVG_EP","Claim")]
dataTest_cov <- dataTest[,c("IssueAgeBucket","Underwriting_Type","Prem_Class","Gender",
"Marital_Status", "Region","PolicyYear","AVG_EP","Claim")]
# Create the model matrices
x.train <- model.matrix(Claim ~ IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP,
dataTrain)[,-1]
y.train <- dataTrain$Claim
x.test <- model.matrix(Claim ~ IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP,
dataTest)[,-1]
First, we apply the classic GLM model, obtaining a value for the AUC of 0.837929.
glm_model <- glm(Claim ~ IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP,
data = dataTrain,
family = binomial(link = "logit"))
glm_prediction <- predict(glm_model, newdata = dataTest, type = "response")
ROC_glm <- ROC(glm_prediction)
(AUC_glm <- AUC(ROC_glm))
Next, we apply the GLM model to the data-set augmented with all the two-way interactions.
glm_model_2 <- glm(Claim ~ (IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP)^2,
data = dataTrain,
family = binomial(link = "logit"))
glm_prediction_2 <- predict(glm_model_2, newdata = dataTest, type = "response")
ROC_glm_2 <- ROC(glm_prediction_2)
(AUC_glm_2 <- AUC(ROC_glm_2))
The new value of the AUC is 0.843197.
To improve the performance of the GLM, we apply regularization methods, such as LASSO.
We run the method for different values of the penalization parameter $\lambda$.
library(glmnet)
# Using the value lambda = 1.013009e-05
lasso_model <- glmnet(x.train, y.train,
lambda = 1.013009e-05,
family = "binomial")
lasso_predictions <- predict(lasso_model, newx = x.test, type = "response")
ROC_lasso <- ROC(lasso_predictions)
(AUC_lasso <- AUC(ROC_lasso))
#Find below the code used to perform the grid search
lambdas <- exp(-seq(10,20,by = 0.5))
AUCs_lasso <- rep(NA, length(lambdas))
for (i in seq_along(lambdas)) {
lambda <- lambdas[i]
lasso_model <- glmnet(x.train, y.train,
lambda = lambda,
family = "binomial")
lasso_predictions <- predict(lasso_model, newx = x.test, type = "response")
ROC_lasso <- ROC(lasso_predictions)
AUC_lasso <- AUC(ROC_lasso)
AUCs_lasso[i] <- AUC_lasso
}
The best value found for $\lambda$ is 1.013009e-05, which correspond to an AUC of 0.8379056.
As for the GLM, we also apply the LASSO to the data-set augmented with the two-way interactions.
x.train2 <- model.matrix(Claim ~ (IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP)^2,
dataTrain)[,-1]
x.test2 <- model.matrix(Claim ~ (IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP)^2,
dataTest)[,-1]
lasso_model_2 <- glmnet(x.train2, y.train,
lambda = 1.494534e-05,
family = "binomial")
lasso_predictions_2 <- predict(lasso_model_2, newx = x.test2, type = "response")
ROC_lasso_2 <- ROC(lasso_predictions_2)
(AUC_lasso_2 <- AUC(ROC_lasso_2))
The value of $\lambda$ which gives the highest AUC is 1.494534e-05, corresponding to an AUC of 0.8432995, improving the results from the LASSO achieved with the standard dataset.
No variable is found to be exactly $0$ in the fitted model.
After the GLM we apply the CART model. As the data are quite unbalanced, growing a tree in the standard way would lead to a tree with only one node where all the data are classified as not claim. To avoid this behaviour, we use a different loss matrix in the algorithm, giving more weights to claims classified as not claim.
Moreover, we do not prune the tree as this would lead to a tree with a single node.
library(rpart)
rpart_model <- rpart(Claim ~ IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP,
data = dataTrain,
minsplit = 100,
cp = 0.000001,
parms=list(split="information",
loss=matrix(c(0,1,
lossvalue,0), # this value is used for tuning the algorithm
byrow=TRUE, nrow=2)),
method="class")
rpart_prediction <- predict(rpart_model, newdata = dataTest)
rpart_prediction <- rpart_prediction[,2]
ROC_rpart <- ROC(rpart_prediction)
(AUC_rpart <- AUC(ROC_rpart))
# Find below the code used to perform the grid search
# losses_value <- c(1,5,10,15,20,30)
# AUCs_rpart <- rep(NA, length(losses_value))
#
# for (i in seq_along(losses_value)) {
#
# print(i)
# loss_Value <- losses_value[i]
#
# rpart_model <- rpart(Claim ~ IssueAgeBucket + Underwriting_Type + Prem_Class +
# Marital_Status + Region + PolicyYear + Gender + AVG_EP,
# data = dataTrain,
# minsplit = 100,
# cp = 0.000001,
# parms=list(split="information", loss=matrix(c(0,1,loss_Value,0), byrow=TRUE, nrow=2)),
# method="class")
#
# rpart_prediction <- predict(rpart_model, newdata = dataTest)
# rpart_prediction <- rpart_prediction[,2]
#
# ROC_rpart <- ROC(rpart_prediction)
# (AUC_rpart <- AUC(ROC_rpart))
#
# AUCs_rpart[i] <- AUC_rpart
#
# }
We tune the algorithm by varying the weight given to the claims classified as not claim. The AUC for different values of the loss parameter are shown below.
library(repr)
options(repr.plot.width=5, repr.plot.height=4)
ggplot(rpart_AUCs, aes(x = Loss, y = AUC)) + geom_point() + geom_line() +
scale_x_continuous(breaks = rpart_AUCs[,1], labels = rpart_AUCs[,1],
minor_breaks = c(),
name = gsub("\\."," ",colnames(rpart_AUCs)[1])) +
scale_y_continuous(breaks = seq(min(rpart_AUCs[,2]),max(rpart_AUCs[,2]),length.out = 5),
minor_breaks = c(),
labels = round(seq(min(rpart_AUCs[,2]),max(rpart_AUCs[,2]),length.out = 5),6)) +
theme(axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey"))
library(repr)
options(repr.plot.width=6.5, repr.plot.height=3)
ggplot(rpart_AUCs, aes(x = Loss, y = AUC)) + geom_point() + geom_line() +
scale_x_continuous(breaks = rpart_AUCs[,1], labels = rpart_AUCs[,1],
minor_breaks = c(),
name = gsub("\\."," ",colnames(rpart_AUCs)[1])) +
scale_y_continuous(breaks = seq(min(rpart_AUCs[,2]),max(rpart_AUCs[,2]),length.out = 5),
minor_breaks = c(),
labels = round(seq(min(rpart_AUCs[,2]),max(rpart_AUCs[,2]),length.out = 5),6)) +
theme(axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey"))
The best value found for the AUC is 0.836824.
After the CART, we apply a bagging procedure to improve its performance. The idea behind using these methods is to average a low-bias high-variance model like trees to reduce their variance.
As in the CART case, also here we need to take into account the degree of imbalance in the dataset by changing the prior probabilities of each class in the algorithm.
A further improvement over bagging is random forest. In random forests, in order to de-correlate the trees, at every split of each tree only a subset of $m$ variables is chosen. A split is attempted if at least $n_{min}$ observations are in each node. For this reason, bagging (using trees) can be seen as a special case of random forests (specifically, when the value $m$ is chosen as the number of all the variables).
The number of trees has been chosen by determining the number after which the MSE does not vary considerably anymore if running the algorithm again.
library(Rborist)
rb_Model <- Rborist(dataTrain[,-which(names(dataTrain) %in% c("Claim"))], factor(dataTrain[,c("Claim")]),
nTree = 200, # number of trees
classWeight = c(0.01,0.99), # prior probability of the two classes
minNode = 5000, # minimum number of observations in node to attempt a split
predFixed = 3 # parameter m explained above
)
rb_predictions <- predict(rb_Model,
newdata = dataTest[,-which(names(dataTrain) %in% c("Claim"))],
ctgCensus = "prob")
rb_predictions <- rb_predictions$prob[,2]
ROC_rb <- ROC(rb_predictions)
(AUC_rb <- AUC(ROC_rb))
# Find below the code used to perform the grid search
# mtrys <- c(1,3,5,8)
# minNodes <- c(20000,40000)
# AUCs_rb <- matrix(NA, nrow = length(mtrys), ncol = length(minNodes))
#
# for(i in seq_along(mtrys)){
# for (j in seq_along(minNodes)) {
#
# mtry <- mtrys[i]
# minNode <- minNodes[j]
#
# rb_Model <- Rborist(dataTrain[,-which(names(dataTrain) %in% c("Claim"))], factor(dataTrain[,c("Claim")]),
# nTree = 200,
# classWeight = c(0.001,0.099),
# minNode = minNode,
# predFixed = mtry)
#
# rb_predictions <- predict(rb_Model,
# newdata = dataTest[,-which(names(dataTest) %in% c("Claim"))],
# ctgCensus = "prob")
# rb_predictions <- rb_predictions$prob[,2]
#
# ROC_rb <- ROC(rb_predictions)
# AUC_rb <- AUC(ROC_rb)
#
# AUCs_rb[i,j] <- AUC_rb
#
# }
# }
library(repr)
options(repr.plot.width=5.5, repr.plot.height=3)
ggplot(AUCs_rb_df, aes(x = Minimum.nodes, y = Error, group = as.factor(Mtry), color = as.factor(Mtry))) +
geom_point() + geom_line() +
scale_x_continuous(breaks = AUCs_rb_df[,2], labels = AUCs_rb_df[,2],
minor_breaks = c(),
name = "minNode") +
scale_y_continuous(breaks = seq(min(AUCs_rb_df[,3]),max(AUCs_rb_df[,3]),length.out = 5),
minor_breaks = c(), labels = round(seq(min(AUCs_rb_df[,3]),max(AUCs_rb_df[,3]),length.out = 5),6)) +
theme(axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey")) + guides(color=guide_legend(title="predFixed"))
The largest value achieved for the AUC is 0.8329288, which does not present an improvement over CART.
Gradient Boosting is a sum of weak learners (usually trees) estimated sequentially. In each step of the algorithm a weak learner is fitted to the current residuals of the model. The process is performed until a maximum number $M$ of trees is grown. The impact of each tree in the model is shrunk by a parameter $\lambda$, in order to slow down the learning of the algorithm as the number of steps in the sequence increases.
The parameters we tune are the number of trees $M$, the ratio between $\lambda$ and the number of trees, and the depth of each individual tree.
library(xgboost)
xgb_model <- xgboost(data = x.train,
label = y.train,
objective = "binary:logistic",
nrounds = 2000, # number of trees
eta = 0.3, # shrinkage parameter
max_depth = 6 # maximum depth of each single tree
)
xgb_predictions <- predict(xgb_model, ntreelimit = 2000, x.test)
ROC_xgb <- ROC(xgb_predictions)
AUC_xgb <- AUC(ROC_xgb)
# Find below the code used to perform the grid search
# etasratio <- c(10, 25, 80)
# max_depths <- c(4, 5, 6)
# ntrees <- c(100, 250, 500, 750, 1000, 2000)
# AUCs_xgb <- array(NA, dim = c(length(etasratio), length(max_depths), length(ntrees)))
#
# for(i in seq_along(etasratio)){
# for(j in seq_along(max_depths)){
#
# eta <- etasratio[i] / max(ntrees)
# max_depth <- max_depths[j]
#
# print(paste0(i," - ",j))
#
# xgb_model <- xgboost(data = x.train,
# label = as.logical(y.train),
# objective = "binary:logistic",
# nrounds = max(ntrees),
# eta = eta,
# max_depth = max_depth,
# verbose = 0
# )
#
# for(k in seq_along(ntrees)){
#
# ntree <- ntrees[k]
#
# xgb_predictions <- predict(xgb_model, ntreelimit = ntree, x.test)
#
# ROC_xgb <- ROC(xgb_predictions)
# AUC_xgb <- AUC(ROC_xgb)
#
# AUCs_xgb[i,j, k] <- AUC_xgb
#
# print(AUC_xgb)
# }
# }
# }
library(repr)
options(repr.plot.width=6, repr.plot.height=10)
multiplot(p1,p2,p3, cols = 1)
The largest value found is 0.8538123
We now investigate the importance of other secondary parameters, such as:
Instead of searching over a grid ranging over all the parameters, which would be too expensive computationally, we start from the best configuration found above ($\lambda$ = 0.15, maximum depth = 4 and $M$ = 250) and we change only one individual parameter at a time to test its individual effect on the AUC.
Below we present the plot of the AUC of several configurations of each parameter. The default parameter is shown as a red line.
library(repr)
options(repr.plot.width=7.5, repr.plot.height=6)
p_gamma <- ggplot(xgb_gamma_err, aes(x = Gamma, y = AUC)) + geom_point(size = 2) + geom_line() +
ylab("AUC") + xlab("Gamma") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12, face = "bold", vjust = 0.5),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey")) +
geom_vline(aes(xintercept = 0), size = 1, color = "red") +
ggtitle("Gamma parameter")
p_subs <- ggplot(xgb_subs_err, aes(x = Subsample, y = AUC)) + geom_point(size = 2) + geom_line() +
ylab("AUC") + xlab("Subsample") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12, face = "bold", vjust = 0.5),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey")) +
geom_vline(aes(xintercept = 1), size = 1, color = "red") +
ggtitle("Subsamples parameter")
p_colsubs <- ggplot(xgb_colsubs_err, aes(x = Subsample, y = AUC)) + geom_point(size = 2) + geom_line() +
ylab("AUC") + xlab("Subsample") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12, face = "bold", vjust = 0.5),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey")) +
geom_vline(aes(xintercept = 1), size = 1, color = "red") +
ggtitle("Column subsamples parameter")
p_mcw <- ggplot(xgb_mcw_err, aes(x = min_child_weights, y = AUC)) + geom_point(size = 2) + geom_line() +
ylab("AUC") + xlab("Minimum child weight") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey"))+
geom_vline(aes(xintercept = 1), size = 1, color = "red") +
ggtitle("Minimum child weight parameter")
multiplot(p_gamma, p_mcw, p_subs, p_colsubs, cols = 2)
The parameter colsample_bytree brings the largest improvement, followed by the subsample parameter and the minimum child weight. The gamma parameter does not bring any change to the model.
Multivariate Adaptive Regression Splines (MARS) can be seen as a generalization of forward regression, suited for high dimensional problems.
The final model is of the form:
$$ f(x) = \beta_0 + \sum_{i=1}^m \beta_i h_i(x) $$
where the functions $h_i(x)$ are from the set $C = \{ (X_j - t)_+,(t - X_j)_+ \}$, where $X_j$ is an independent variable, or product of functions in this set.
The model is trained like a forward regression, adding terms from $C$ and products of terms of $C$ with functions of the current basis.
As the full model is expected to overfit the data, some terms are removed from the model at the end of the fitting. In this respect, this procedure can be seen as similar to the CART procedure.
The main parameter to tune is the maximum degree of interaction allowed in the products between functions.
library(earth)
mars_model <- earth(factor(Claim) ~ IssueAgeBucket + Underwriting_Type + Prem_Class +
Marital_Status + Region + PolicyYear + Gender + AVG_EP,
data = dataTrain,
degree = 3, glm = list(family=binomial(link = "logit"))
)
mars_prediction <- predict(mars_model, newdata = dataTest, type = "response")
ROC_mars <- ROC(mars_prediction)
(AUC_mars <- AUC(ROC_mars))
library(repr)
options(repr.plot.width=5, repr.plot.height=3.5)
ggplot(mars_AUCs, aes(x = Degree, y = AUC)) + geom_point() + geom_line() +
scale_x_continuous(breaks = mars_AUCs[,1], labels = mars_AUCs[,1],
minor_breaks = c(),
name = gsub("\\."," ",colnames(mars_AUCs)[1])) +
scale_y_continuous(breaks = seq(min(mars_AUCs[,2]),max(mars_AUCs[,2]),length.out = 5),
minor_breaks = c(),
labels = round(seq(min(mars_AUCs[,2]),max(mars_AUCs[,2]),length.out = 5),6)) +
theme(axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey"))
The largest value achieved is 0.8323743.
Next, we plot the different ROC curves for the best configuration of each model.
First we plot the ROC curves for GLM (black), LASSO (red) and MARS (green). As the result from LASSO and GLM are very similar, the curves overlap.
library(repr)
options(repr.plot.width=7, repr.plot.height=6)
ROC_1 <- ROC_lasso
ROC_2 <- ROC_glm
ROC_3 <- ROC_mars
ggplot(data = NULL, aes(x = ROC_1[,1], y = ROC_1[,2])) + xlim(c(0,1)) + ylim(c(0,1)) + geom_point() + geom_line() +
geom_point(data = NULL, aes(x = ROC_2[,1], y = ROC_2[,2]), color = "red") +
geom_line(data = NULL, aes(x = ROC_2[,1], y = ROC_2[,2]), color = "red") +
geom_point(data = NULL, aes(x = ROC_3[,1], y = ROC_3[,2]), color = "green") +
geom_line(data = NULL, aes(x = ROC_3[,1], y = ROC_3[,2]), color = "green") +
theme(axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey")) +
xlab("True Positive Rate") + ylab("False positive rate")
Next, we show the curves for CART (green), Gradient Boosting(black) and Random Forests(red). The Gradient Boosting is the model which performs better.
library(repr)
options(repr.plot.width=7, repr.plot.height=6)
ROC_1 <- ROC_xgb
ROC_2 <- ROC_rb
ROC_3 <- ROC_rpart
ggplot(data = NULL, aes(x = ROC_1[,1], y = ROC_1[,2])) + xlim(c(0,1)) + ylim(c(0,1)) + geom_point() + geom_line() +
geom_point(data = NULL, aes(x = ROC_2[,1], y = ROC_2[,2]), color = "red") +
geom_line(data = NULL, aes(x = ROC_2[,1], y = ROC_2[,2]), color = "red") +
geom_point(data = NULL, aes(x = ROC_3[,1], y = ROC_3[,2]), color = "green") +
geom_line(data = NULL, aes(x = ROC_3[,1], y = ROC_3[,2]), color = "green") +
theme(axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 9),
panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey")) +
xlab("True Positive Rate") + ylab("False positive rate")
In this dataset the performances are similar for all the model apart from gradient boosting, which outperforms the other methods.
We did not perform Bayesian variable selection methods as the algorithm is too slow when used with binomial data, hence not practical.