---
title: "Resampling"
output:
    pdf_document:
        fig_height: 3
---

# Manual Splitting -- what could possibly go wrong?

```{r}
library(mlr)

task = makeClassifTask(data = iris, target = "Species")
learner = makeLearner("classif.randomForest")
```

## The Good

```{r}
model = train(learner, task, subset = c(1:30, 51))
model

predictions = predict(model, task = task, subset = 31:50)
predictions

performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)
```

## The Bad

```{r}
model = train(learner, task, subset = 1:100)
model

predictions = predict(model, task = task, subset = 101:150)
predictions

performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)
```

## The Ugly


```{r}
model = train(learner, task, subset = c(1:45, 51:95, 101:110))
model

predictions = predict(model, task = task, subset = c(46:50, 96:100, 111:150))
predictions

performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)
```

# Automatic Splitting

# Holdout

```{r}
rdesc = makeResampleDesc(method = "Holdout", split = 2/3)
result = resample(learner, task, rdesc, measures = acc)

predictions = getRRPredictions(result)
performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)
```

Using the `holdout` function:

```{r}
result = holdout(learner, task, measures = acc, split = 2/3)

predictions = getRRPredictions(result)
performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)
```

### Stratification

```{r}
result = holdout(learner, task, measures = acc, split = 2/3, stratify = TRUE)

predictions = getRRPredictions(result)
performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)
```

# Subsample

```{r}
rdesc = makeResampleDesc(method = "Subsample", iters = 10, split = 2/3, predict = "both")
# or use the "subsample" function
result = resample(learner, task, rdesc, measures = acc)
```

## Details for each iteration

```{r}
getRRPredictions(result)
predictionList = getRRPredictionList(result)
sapply(predictionList$test, performance, measures = acc)
# this is also directly available in the resample result
result$measures.test
lapply(predictionList$test, calculateConfusionMatrix)
```

```{r}
library(ggplot2)
ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc),
                  set = rep(c("train", "test"), each = nrow(result$measures.train))),
    aes(set, acc)) + geom_boxplot() + ylim(0, 1)
```

# Bootstrap

```{r}
rdesc = makeResampleDesc(method = "Bootstrap", predict = "both")
# or use the "bootstrapOOB" function
result = resample(learner, task, rdesc, measures = acc)
```

```{r}
ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc),
                  set = rep(c("train", "test"), each = nrow(result$measures.train))),
    aes(set, acc)) + geom_boxplot() + ylim(0, 1)
```

# Cross-Validation

```{r}
rdesc = makeResampleDesc(method = "CV", predict = "both")
# or use the "crossval" function
result = resample(learner, task, rdesc, measures = acc)
```

```{r}
ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc),
                  set = rep(c("train", "test"), each = nrow(result$measures.train))),
    aes(set, acc)) + geom_boxplot() + ylim(0, 1)
```

## Leave-One-Out Cross-Valdation

```{r}
# or set number of folds to n
rdesc = makeResampleDesc(method = "LOO", predict = "both")
result = resample(learner, task, rdesc, measures = acc)
```

```{r}
ggplot(data.frame(acc = c(result$measures.train$acc, result$measures.test$acc),
                  set = rep(c("train", "test"), each = nrow(result$measures.train))),
    aes(set, acc)) + geom_boxplot() + ylim(0, 1)
```

# Blocking

```{r}
task = makeClassifTask(data = iris, target = "Species", blocking = iris$Species)
rdesc = makeResampleDesc(method = "CV", iters = 3)
result = resample(learner, task, rdesc, measures = acc)
```