Basic Ensemble Model in R

Jun 26, 2021 5 min read coding, data science

Overview

This project was designed to showcase the Ensemble Methodology for classification. Ensembles can provide better predictive capabilities than individual classification techniques; however, they come at the expense of increased processing power and time requirements. This is a proof of concept model that does not split the data into training and testing sets.

Library Necessary Packages

library(mlbench)        # Dataset

library(e1071)          # SVM, NB models
library(nnet)           # Neural Net model
library(rpart)          # RPart, LOOCV models
library(MASS)           # QDA model 
library(klaR)           # RDA model 
library(randomForest)   # Random Forest model

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

library(caret)          # Confusion Matrix

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

library(ggplot2)        # Better Plots
library(rpart.plot)     # Better RPart Plots
library(wesanderson)    # Better Color Schemes

Load Data

This section imports the Breast Cancer dataset, omitting NA rows and removing the ID column, while keeping a copy of the IDs so they can be appended to the results table.

data(BreastCancer)
BreastCancer <- na.omit(BreastCancer) 

BreastCancer.id <- BreastCancer[, 1]

BreastCancer <- BreastCancer[,-1]

Models

This section runs a variety of different classification models all which return a true or false classification.

Support Vector Machine Model

mysvm <- svm(Class ~ . , BreastCancer)
mysvm.pred <- predict(mysvm, BreastCancer)

length(mysvm.pred)

## [1] 683

length(BreastCancer$Class)

## [1] 683

table(mysvm.pred, BreastCancer$Class)

##            
## mysvm.pred  benign malignant
##   benign       431         8
##   malignant     13       231

Naive Bayes Model

mynb <- NaiveBayes(Class ~ . , BreastCancer)
mynb.pred <- predict(mynb, BreastCancer)
table(mynb.pred$class, BreastCancer$Class)

##            
##             benign malignant
##   benign       431         3
##   malignant     13       236

Neural Net Model

mynnet <- nnet(Class ~ . , BreastCancer, size = 1)

## # weights:  83
## initial  value 444.875707 
## iter  10 value 70.908039
## iter  20 value 47.042231
## iter  30 value 38.495429
## iter  40 value 37.410696
## iter  50 value 33.178027
## iter  60 value 33.159982
## iter  70 value 32.581901
## iter  80 value 28.939637
## iter  90 value 28.933749
## iter 100 value 28.929784
## final  value 28.929784 
## stopped after 100 iterations

mynnet.pred <- predict(mynnet, BreastCancer, type = "class")
table(mynnet.pred, BreastCancer$Class)

##            
## mynnet.pred benign malignant
##   benign       441         2
##   malignant      3       237

Decision Tree Model

mytree <- rpart(Class ~ . , BreastCancer)
rpart.plot(mytree, box.palette = "GnRd")

mytree.pred <- predict(mytree, BreastCancer, type = "class")
table(mytree.pred, BreastCancer$Class)

##            
## mytree.pred benign malignant
##   benign       431         9
##   malignant     13       230

Leave-1-Out Cross Validation (LOOCV) Model

ans <- numeric(length(BreastCancer[, 1]))

for (i in 1:length(BreastCancer[, 1])) {
 myloocv <- rpart(Class ~ . , BreastCancer[-i, ])
 myloocv.pred <- predict(myloocv, BreastCancer[i, ], type = "class", se.fit = FALSE)
 ans[i] <- myloocv.pred
}

myloocv.pred <- factor(ans, labels = levels(BreastCancer$Class))
table(myloocv.pred, BreastCancer$Class)

##             
## myloocv.pred benign malignant
##    benign       430        20
##    malignant     14       219

Quadratic Discriminant Analysis Model

BreastCancer.num <- BreastCancer

for(c in 1:ncol(BreastCancer.num)){
  BreastCancer.num[, c] <- cbind(as.numeric(BreastCancer.num[, c]))
}

myqda <- qda(Class ~ . , BreastCancer.num)
myqda.pred <- predict(myqda, BreastCancer.num)

table(myqda.pred$class, BreastCancer.num$Class)

##    
##       1   2
##   1 422   6
##   2  22 233

Regularized Discriminant Analysis Model

myrda <- rda(Class ~ . , BreastCancer)
myrda.pred <- predict(myrda, BreastCancer)

table(myrda.pred$class, BreastCancer$Class)

##            
##             benign malignant
##   benign       433         3
##   malignant     11       236

Random Forests Model

myrf <- randomForest(Class ~ . , BreastCancer)
myrf.pred <- predict(myrf, BreastCancer)
table(myrf.pred, BreastCancer$Class)

##            
## myrf.pred   benign malignant
##   benign       444         0
##   malignant      0       239

Aggregate Predictions

This section combines predictions from all other sections into one dataframe. It then converts all columns to numeric to allow for a Majority Rule column to be created. This column calculates the average of each row’s values, then assigns a 1 or 2 based on whether the value is above or below 1.5. In the event of a tie (4 models with 1, 4 models with 2) the calculation takes the conservative approach of classifying as malignant.

Predictions.num <- data.frame(mysvm.pred, mynb.pred$class, as.factor(mynnet.pred), mytree.pred, myloocv.pred, myqda.pred$class, myrda.pred$class, myrf.pred)

for(c in 1:ncol(Predictions.num)){
  Predictions.num[, c] <- as.numeric(Predictions.num[, c])
}

denom <- ncol(Predictions.num)

Predictions.num$MajRule <- ifelse(rowSums(Predictions.num) / denom < 1.5, 1, 2)

Output

This section converts the predictions back to factors, assigns names to each column of the Predictions dataframe, and creates a barchart with a count of each model’s predictions. Finally, a results dataframe is created that has the ID of the sample, and the majority prediction.

Predictions <- Predictions.num

prednames <- c("SVM", "Naive Bayes", "Neural Net", "Decision Tree", "LOOCV", "QDA", "RDA", "Random Forest", "Majority Vote")
type <- c("benign", "malignant")

for(c in 1:ncol(Predictions)){
  names(Predictions)[c] <- prednames[c]
  Predictions[, c] <- as.factor(Predictions[, c])
  levels(Predictions[, c]) <- type
  
  print(ggplot(data = Predictions, 
               aes(x = Predictions[, c],
                   fill = Predictions[, c])) +
    geom_bar(width = 0.25) +
    scale_fill_manual(values = wes_palette(n = 3, name = "Royal1"), name = "Prediction") +
    xlab(paste("Model Type: ", 
               prednames[c])) +
    geom_text(stat = 'count', 
              aes(label = ..count..), 
              vjust = -0.4,
              hjust = NA))
}

Results <- data.frame(BreastCancer.id, Predictions$`Majority Vote`)

confusionMatrix(Predictions$`Majority Vote`, BreastCancer$Class)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign       433         2
##   malignant     11       237
##                                           
##                Accuracy : 0.981           
##                  95% CI : (0.9677, 0.9898)
##     No Information Rate : 0.6501          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9585          
##                                           
##  Mcnemar's Test P-Value : 0.0265          
##                                           
##             Sensitivity : 0.9752          
##             Specificity : 0.9916          
##          Pos Pred Value : 0.9954          
##          Neg Pred Value : 0.9556          
##              Prevalence : 0.6501          
##          Detection Rate : 0.6340          
##    Detection Prevalence : 0.6369          
##       Balanced Accuracy : 0.9834          
##                                           
##        'Positive' Class : benign          
##

coding r data science