--- title: "Resampling" output: pdf_document: fig_height: 3 --- # Manual Splitting -- what could possibly go wrong? ```{r} library(mlr) task = makeClassifTask(data = iris, target = "Species") learner = makeLearner("classif.randomForest") ``` ## The Good ```{r} model = train(learner, task, subset = c(1:30, 51)) model predictions = predict(model, task = task, subset = 31:50) predictions performance(predictions, measures = acc) calculateConfusionMatrix(predictions) ``` ## The Bad ```{r} model = train(learner, task, subset = 1:100) model predictions = predict(model, task = task, subset = 101:150) predictions performance(predictions, measures = acc) calculateConfusionMatrix(predictions) ``` ## The Ugly ```{r} model = train(learner, task, subset = c(1:45, 51:95, 101:110)) model predictions = predict(model, task = task, subset = c(46:50, 96:100, 111:150)) predictions performance(predictions, measures = acc) calculateConfusionMatrix(predictions) ``` # Automatic Splitting # Holdout ```{r} rdesc = makeResampleDesc(method = "Holdout", split = 2/3) result = resample(learner, task, rdesc, measures = acc) predictions = getRRPredictions(result) performance(predictions, measures = acc) calculateConfusionMatrix(predictions) ``` Using the `holdout` function: ```{r} result = holdout(learner, task, measures = acc, split = 2/3) predictions = getRRPredictions(result) performance(predictions, measures = acc) calculateConfusionMatrix(predictions) ``` ### Stratification ```{r} result = holdout(learner, task, measures = acc, split = 2/3, stratify = TRUE) predictions = getRRPredictions(result) performance(predictions, measures = acc) calculateConfusionMatrix(predictions) ``` # Subsample ```{r} rdesc = makeResampleDesc(method = "Subsample", iters = 10, split = 2/3, predict = "both") # or use the "subsample" function result = resample(learner, task, rdesc, measures = acc) ``` ## Details for each iteration ```{r} getRRPredictions(result) predictionList = getRRPredictionList(result) sapply(predictionList$test, performance, measures = acc) # this is also directly available in the resample result result$measures.test lapply(predictionList$test, calculateConfusionMatrix) ``` ```{r} library(ggplot2) ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc), set = rep(c("train", "test"), each = nrow(result$measures.train))), aes(set, acc)) + geom_boxplot() + ylim(0, 1) ``` # Bootstrap ```{r} rdesc = makeResampleDesc(method = "Bootstrap", predict = "both") # or use the "bootstrapOOB" function result = resample(learner, task, rdesc, measures = acc) ``` ```{r} ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc), set = rep(c("train", "test"), each = nrow(result$measures.train))), aes(set, acc)) + geom_boxplot() + ylim(0, 1) ``` # Cross-Validation ```{r} rdesc = makeResampleDesc(method = "CV", predict = "both") # or use the "crossval" function result = resample(learner, task, rdesc, measures = acc) ``` ```{r} ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc), set = rep(c("train", "test"), each = nrow(result$measures.train))), aes(set, acc)) + geom_boxplot() + ylim(0, 1) ``` ## Leave-One-Out Cross-Valdation ```{r} # or set number of folds to n rdesc = makeResampleDesc(method = "LOO", predict = "both") result = resample(learner, task, rdesc, measures = acc) ``` ```{r} ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc), set = rep(c("train", "test"), each = nrow(result$measures.train))), aes(set, acc)) + geom_boxplot() + ylim(0, 
# Blocking

```{r}
task = makeClassifTask(data = iris, target = "Species", blocking = iris$Species)
rdesc = makeResampleDesc(method = "CV", iters = 3)
result = resample(learner, task, rdesc, measures = acc)
```
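With blocking, all observations of the same block are kept together in one fold. A minimal sketch (not in the original) to check this, reusing the blocked `task` and `rdesc` from above: with 3 folds and 3 blocks, each test set should consist of exactly one complete species.

```{r}
# Sketch: inspect which species end up in each test fold of the blocked task.
rin = makeResampleInstance(rdesc, task)
lapply(rin$test.inds, function(idx) table(iris$Species[idx]))
```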